Spaces:

aravindkb7
/

SourceTruth_Test

Sleeping

App Files Files Community

SourceTruth_Test / app.py

aravindkb7

Upload app.py

25e0d89 verified 1 day ago

raw

history blame contribute delete

55.4 kB

	import hashlib
	import html
	import json
	import os
	import re
	import shutil
	import tempfile
	import threading
	import time
	import uuid
	from dataclasses import dataclass, field
	from pathlib import Path
	from typing import Dict, List, Optional, Tuple

	import faiss
	import gradio as gr
	from sentence_transformers import SentenceTransformer

	from extracted_phase2_core import AgenticSelfRAG, Chunk, K_PASSAGES


	# --- Application identity -------------------------------------------------
	APP_NAME = "SourceTruth"
	APP_TAGLINE = "Ask grounded questions over the preloaded Phase 2 project corpus and inspect cited evidence."

	# --- Filesystem layout (everything is anchored next to this file) ---------
	APP_ROOT = Path(__file__).resolve().parent
	UPLOAD_ROOT = APP_ROOT / "testing_uploads"
	LOG_ROOT = APP_ROOT / "testing_logs"
	EVENT_LOG_PATH = LOG_ROOT / "events.jsonl"
	INTERACTION_LOG_PATH = LOG_ROOT / "interactions.jsonl"

	# Directories probed, in order, for the numbered corpus PDFs. An optional
	# LOCAL_CORPUS_DIR environment override is tried last.
	LOCAL_CORPUS_DIR = os.getenv("LOCAL_CORPUS_DIR", "").strip()
	CORPUS_CANDIDATES = [APP_ROOT / "phase2_corpus", APP_ROOT / "phase 2 corpus", APP_ROOT]
	if LOCAL_CORPUS_DIR:
	    CORPUS_CANDIDATES.append(Path(LOCAL_CORPUS_DIR).expanduser())

	# --- Tunables, each overridable through an environment variable -----------
	MAX_FILE_SIZE_MB = int(os.getenv("MAX_FILE_SIZE_MB", "20"))
	MAX_PAGES = int(os.getenv("MAX_PAGES", "75"))
	MAX_CHUNKS = int(os.getenv("MAX_CHUNKS", "250"))
	CHUNK_WORDS = int(os.getenv("CHUNK_WORDS", "300"))
	CHUNK_OVERLAP = int(os.getenv("CHUNK_OVERLAP", "50"))
	SESSION_TTL_SECONDS = int(os.getenv("SESSION_TTL_SECONDS", str(30 * 60)))
	MAX_QUESTIONS_PER_MINUTE = int(os.getenv("MAX_QUESTIONS_PER_MINUTE", "8"))
	QUEUE_CONCURRENCY = int(os.getenv("QUEUE_CONCURRENCY", "2"))
	QUEUE_MAX_SIZE = int(os.getenv("QUEUE_MAX_SIZE", "20"))
	# Only the exact string "1" enables 4-bit loading.
	LOAD_IN_4BIT = os.getenv("LOAD_IN_4BIT", "0") == "1"
	MAX_SUMMARY_SENTENCES = int(os.getenv("MAX_SUMMARY_SENTENCES", "3"))

	# --- User-facing notice -----------------------------------------------------
	PRIVACY_NOTICE = (
	    "The preloaded project PDFs are processed only to answer your questions and produce citations. "
	    "Documents are not used to train models. Interaction logs may store the question, answer, citation, "
	    "and proxy evaluation metrics for testing analysis. Avoid using the application for confidential, "
	    "personal, medical, or legal decisions without direct document verification."
	)

	# Stylesheet text held as a single string: a layered gradient background for
	# `.gradio-container` and a dark gradient style for the element with id
	# `ask_btn`. Where it is applied is not visible in this part of the file.
	CSS = """
	.gradio-container {
	background:
	radial-gradient(circle at top left, rgba(59,130,246,0.08), transparent 28%),
	radial-gradient(circle at top right, rgba(16,185,129,0.08), transparent 22%),
	linear-gradient(180deg, #f8fbff 0%, #f4f7fb 100%);
	}
	#ask_btn {
	background: linear-gradient(135deg, #0f172a 0%, #1d4ed8 100%) !important;
	color: white !important;
	border: none !important;
	}
	"""

	# Precompiled patterns used to recognise typed facts inside evidence sentences.
	#
	# NOTE: alternations must use a bare "|". An escaped "\|" matches a literal
	# pipe character and silently disables the alternation.

	# Two or three capitalised words, optionally preceded by "Dr" / "Dr.".
	PERSON_RE = re.compile(r"\b(?:Dr\.?\s+)?[A-Z][a-z]+(?:\s+[A-Z][a-z]+){1,2}\b")
	# A currency marker (₹, INR, Rs, Rs.) followed by a number; whitespace between
	# the marker and the number is optional.
	CURRENCY_RE = re.compile(r"(?:₹\s*[\d,]+(?:\.\d+)?|\b(?:INR|Rs\.?)\s*[\d,]+(?:\.\d+)?)", re.I)
	# A money-related label, optional separators, optional currency marker
	# (group 1) and the numeric amount (group 2).
	AMOUNT_RE = re.compile(
	    r"\b(?:paid amount|amount paid|total price|price|amount|budget|cost)\b"
	    r"[:\s-]*(₹|INR|Rs\.?)?\s*([\d,]+(?:\.\d+)?)",
	    re.I,
	)
	# Dotted version numbers with two to four numeric components, e.g. 12.55 or 1.2.3.4.
	VERSION_RE = re.compile(r"\b\d+(?:\.\d+){1,3}\b")
	# Dates written as "<day> <full month name> <4-digit year>".
	DATE_RE = re.compile(
	    r"\b\d{1,2}\s+(?:January|February|March|April|May|June|July|August|"
	    r"September|October|November|December)\s+\d{4}\b",
	    flags=re.IGNORECASE,
	)
	# Plain numbers with optional thousands separators and decimals.
	NUMBER_RE = re.compile(r"\b\d[\d,]*(?:\.\d+)?\b")

	# Role words that signal a question is asking about a person.
	ROLE_NOUNS = {"guide", "supervisor", "advisor", "mentor", "approver", "director", "lead", "manager"}

	# Lower-case phrases (and abbreviations) mapped to the corpus PDF they refer to.
	DOC_NAME_HINTS = {
	    "project charter": "01_Project_Charter.pdf",
	    "validation master plan": "02_Validation_Master_Plan.pdf",
	    "vmp": "02_Validation_Master_Plan.pdf",
	    "user requirements specification": "03_User_Requirements_Specification.pdf",
	    "urs": "03_User_Requirements_Specification.pdf",
	    "functional requirements specification": "04_Functional_Requirements_Specification.pdf",
	    "frs": "04_Functional_Requirements_Specification.pdf",
	    "risk assessment": "05_Risk_Assessment.pdf",
	    "configuration guide": "06_HP_ALM_Configuration_Guide.pdf",
	    "hp alm configuration guide": "06_HP_ALM_Configuration_Guide.pdf",
	    "iq protocol": "07_IQ_Protocol_and_Report.pdf",
	    "iq report": "07_IQ_Protocol_and_Report.pdf",
	    "oq protocol": "08_OQ_Protocol_and_Report.pdf",
	    "oq report": "08_OQ_Protocol_and_Report.pdf",
	    "data migration plan": "09_Data_Migration_Plan.pdf",
	    "migration plan": "09_Data_Migration_Plan.pdf",
	    "data migration summary": "10_Data_Migration_Summary_Report.pdf",
	    "migration summary": "10_Data_Migration_Summary_Report.pdf",
	    "pq": "11_PQ_UAT_Protocol_and_Report.pdf",
	    "uat": "11_PQ_UAT_Protocol_and_Report.pdf",
	    "validation summary report": "12_Validation_Summary_Report.pdf",
	    "vsr": "12_Validation_Summary_Report.pdf",
	    "traceability matrix": "13_Traceability_Matrix.pdf",
	    "rtm": "13_Traceability_Matrix.pdf",
	    "change control sop": "14_Change_Control_SOP.pdf",
	    "regulatory reference guide": "15_Regulatory_Reference_Guide.pdf",
	}


	@dataclass
	class PageRecord:
	    """Extracted text of one PDF page, kept both joined and line by line."""

	    # Name of the PDF the page came from.
	    source_file: str
	    # 1-based page number within that PDF.
	    page_num: int
	    # Page text as a single string.
	    text: str
	    # The same page as a list of individual text lines.
	    lines: List[str]


	@dataclass
	class Citation:
	    """Pointer to the lines of a page that back an answer."""

	    # File the excerpt is attributed to.
	    source_file: str
	    # Page number of the excerpt.
	    page_num: int
	    # First and last line number of the excerpt (inclusive).
	    line_start: int
	    line_end: int
	    # The quoted text itself.
	    excerpt: str


	@dataclass
	class SessionData:
	    """Everything held in memory for one indexed document set."""

	    # --- identity and on-disk location ---
	    session_id: str
	    temp_dir: str
	    pdf_path: str
	    file_name: str
	    file_hash: str
	    file_size_bytes: int
	    # --- extracted content and search machinery ---
	    page_records: List[PageRecord]
	    chunks: List[Chunk]
	    retriever: "SessionRetriever"
	    agent: AgenticSelfRAG
	    page_count: int
	    # Name of the extraction backend that produced `page_records`.
	    extractor: str
	    # Parsed side tables; other code in this file reads the "headers" and
	    # "vmp_table" keys from it.
	    structured: Dict[str, dict] = field(default_factory=dict)
	    # --- bookkeeping timestamps (seconds since the epoch) ---
	    created_at: float = field(default_factory=time.time)
	    last_activity: float = field(default_factory=time.time)
	    question_timestamps: List[float] = field(default_factory=list)


	@dataclass
	class QuestionPlan:
	    """Routing decision derived from the wording of a question."""

	    # Broad answer style, e.g. "factoid", "procedural" or "descriptive".
	    mode: str
	    # Kind of value the answer should contain, e.g. "date" or "person".
	    expected_type: str
	    # The question text plus extra retrieval keywords.
	    expanded_query: str
	    # Whether the agentic pipeline may be used as a fallback.
	    allow_agentic_fallback: bool = True


	class EmptyRetriever:
	    """Placeholder retriever that holds no chunks and never returns any."""

	    def __init__(self):
	        self.chunks: List[Chunk] = []

	    def retrieve(self, query: str, k: int = K_PASSAGES) -> List[Chunk]:
	        """Ignore `query` and `k`; always return a new empty list."""
	        return list()


	class SessionRetriever:
	    """Dense retriever over a fixed list of chunks, backed by a FAISS index.

	    Embeddings are L2-normalised and stored in an inner-product index, so the
	    search score is the cosine similarity between query and chunk.
	    """

	    def __init__(self, chunks: List[Chunk], encoder: SentenceTransformer):
	        self.chunks = chunks
	        self._encoder = encoder
	        self.index = None
	        self._build_index()

	    def _embed(self, texts: List[str]):
	        """Encode `texts` into normalised float32 vectors."""
	        vectors = self._encoder.encode(
	            texts,
	            convert_to_numpy=True,
	            normalize_embeddings=True,
	            show_progress_bar=False,
	        )
	        return vectors.astype("float32")

	    def _build_index(self):
	        """Embed every chunk and load the vectors into a new index.

	        Leaves `self.index` as None when there are no chunks.
	        """
	        if self.chunks:
	            # The file name is prepended so it contributes to the embedding.
	            corpus = [f"{chunk.source_file} {chunk.text}" for chunk in self.chunks]
	            vectors = self._embed(corpus)
	            self.index = faiss.IndexFlatIP(vectors.shape[1])
	            self.index.add(vectors)

	    def retrieve(self, query: str, k: int = K_PASSAGES) -> List[Chunk]:
	        """Return up to `k` chunks most similar to `query`, best first."""
	        if self.index is None:
	            return []
	        total = len(self.chunks)
	        _, hits = self.index.search(self._embed([query]), min(k, total))
	        # Positions outside the chunk list are dropped.
	        return [self.chunks[pos] for pos in hits[0] if 0 <= pos < total]


	# Process-wide mutable state.
	# Live sessions keyed by session id.
	SESSIONS: Dict[str, SessionData] = {}
	# Held by cleanup_expired_sessions() while it scans SESSIONS.
	SESSIONS_LOCK = threading.Lock()
	# Serialises the lazy creation of GLOBAL_AGENT_TEMPLATE.
	MODEL_LOCK = threading.Lock()
	# Serialises the lazy creation of GLOBAL_EMBEDDER.
	EMBEDDER_LOCK = threading.Lock()
	# Lazily created singletons; None until first requested.
	GLOBAL_EMBEDDER: Optional[SentenceTransformer] = None
	GLOBAL_AGENT_TEMPLATE: Optional[AgenticSelfRAG] = None


	def ensure_directories():
	    """Create the upload and log directories (and parents) if they are missing."""
	    for directory in (UPLOAD_ROOT, LOG_ROOT):
	        directory.mkdir(parents=True, exist_ok=True)


	def now_ts() -> float:
	    """Return the current wall-clock time as seconds since the epoch."""
	    current = time.time()
	    return current


	def normalize_text(text: str) -> str:
	    """Collapse all whitespace runs to single spaces and trim the ends.

	    U+2581 and the non-breaking space U+00A0 are turned into plain spaces
	    before collapsing.
	    """
	    spaced = text.replace("\u2581", " ").replace("\xa0", " ")
	    return re.sub(r"\s+", " ", spaced).strip()


	def pretty_doc_name(file_name: str) -> str:
	    """Turn a corpus file name into a human-readable title.

	    Removes every ".pdf" occurrence, strips a leading "<digits>_" prefix and
	    replaces underscores with spaces.
	    """
	    without_ext = file_name.replace(".pdf", "")
	    without_number = re.sub(r"^\d+_", "", without_ext)
	    return without_number.replace("_", " ")


	def content_terms(text: str) -> set:
	    """Return the distinct, lower-cased content words of `text`.

	    A word starts with a letter and continues with letters, digits, "_" or
	    "-". Stop words and words of two characters or fewer are dropped.
	    """
	    stop_words = frozenset(
	        (
	            "a an the is are was were be been being "
	            "do does did have has had how what when "
	            "where why which who whom this that these "
	            "those and or but for with into from about "
	            "main use uses using used number version date "
	            "system document pdf page line file does give"
	        ).split()
	    )
	    terms = set()
	    for word in re.findall(r"[A-Za-z][A-Za-z0-9_-]+", text.lower()):
	        if len(word) > 2 and word not in stop_words:
	            terms.add(word)
	    return terms


	def clip_text(text: str, max_chars: int = 320) -> str:
	    """Normalise `text` and shorten it to roughly `max_chars` characters.

	    Text that already fits is returned unchanged (after normalisation).
	    Longer text is cut at `max_chars`, everything after the last space of the
	    cut is discarded, and "..." is appended.
	    """
	    cleaned = normalize_text(text)
	    if len(cleaned) > max_chars:
	        head = cleaned[:max_chars].rsplit(" ", 1)[0]
	        return head.strip() + "..."
	    return cleaned


	def question_plan(question: str) -> QuestionPlan:
	    """Classify a question by keyword cues and build its retrieval query.

	    The first matching rule below decides `mode` and `expected_type`; a
	    question that matches none is treated as descriptive free text. The
	    expanded query is the normalised, lower-cased question followed by extra
	    keywords chosen from the question text and the expected type.
	    """
	    q = normalize_text(question).lower()

	    def mentions(*cues: str) -> bool:
	        # True when any cue occurs anywhere in the question (substring match).
	        return any(cue in q for cue in cues)

	    # Rule order matters: earlier rules win.
	    if mentions("how to", "how do", "how should", "steps", "process", "procedure", "workflow", "manage ", "handling "):
	        mode, expected = "procedural", "procedure"
	    elif q.startswith("who") or mentions("who is", "who was"):
	        mode, expected = "factoid", "person"
	    elif mentions("how many", "count", "number of"):
	        mode, expected = "factoid", "number"
	    elif mentions("amount", "paid amount", "price", "cost", "total", "fee"):
	        mode, expected = "factoid", "currency"
	    elif mentions("date", "when", "go-live"):
	        mode, expected = "factoid", "date"
	    elif "version" in q:
	        mode, expected = "factoid", "version"
	    elif "name of the project" in q or ("project" in q and "name" in q):
	        mode, expected = "factoid", "project_name"
	    elif "name of" in q:
	        mode, expected = "factoid", "name"
	    elif q.startswith(("what is", "what was", "what were")):
	        mode, expected = "factoid", "text"
	    else:
	        mode, expected = "descriptive", "text"

	    # Extra retrieval keywords, appended in a fixed order.
	    extras: List[str] = []
	    if "deviation" in q:
	        extras.append(" deviation deviations reviewed review closed closure investigated approved documented")
	    if "guide" in q:
	        extras.append(" guide supervisor advisor mentor person name")
	    if expected == "currency":
	        extras.append(" amount paid total INR Rs price payment")
	    if expected == "project_name":
	        extras.append(" project name document project")
	    if expected == "procedure":
	        extras.append(" steps process procedure shall must review close approve")

	    return QuestionPlan(
	        mode=mode,
	        expected_type=expected,
	        expanded_query=q + "".join(extras),
	        # Every rule above leaves the agentic fallback enabled.
	        allow_agentic_fallback=True,
	    )


	def matched_source_files(question: str) -> List[str]:
	    """Return the corpus files whose name hints appear in `question`.

	    Hints are matched as whole words or phrases (an optional plural "s" is
	    tolerated) rather than as raw substrings. Plain substring matching let
	    short abbreviations such as "uat", "urs" or "pq" fire inside unrelated
	    words like "evaluate" or "hours". Each file is listed once, in the order
	    its first matching hint appears in DOC_NAME_HINTS.
	    """
	    q = normalize_text(question).lower()
	    matches: List[str] = []
	    for hint, file_name in DOC_NAME_HINTS.items():
	        if file_name in matches:
	            continue
	        if re.search(rf"\b{re.escape(hint)}s?\b", q):
	            matches.append(file_name)
	    return matches


	def evidence_has_expected_type(plan: QuestionPlan, sentences: List[str]) -> bool:
	    """Tell whether the evidence contains a value of the type the plan expects.

	    Returns False for empty evidence. For expected types without a dedicated
	    check (for example "text") any non-empty evidence is accepted.
	    """
	    if not sentences:
	        return False

	    joined = " ".join(sentences)
	    kind = plan.expected_type

	    if kind == "person":
	        # A person answer is only plausible when a name-like pattern is present.
	        return bool(PERSON_RE.search(joined))
	    if kind == "currency":
	        return bool(CURRENCY_RE.search(joined) or AMOUNT_RE.search(joined))

	    single_pattern = {"date": DATE_RE, "version": VERSION_RE, "number": NUMBER_RE}
	    if kind in single_pattern:
	        return bool(single_pattern[kind].search(joined))

	    if kind == "procedure":
	        lowered = joined.lower()
	        markers = ["must", "shall", "should", "reviewed", "closed", "approved", "documented", "investigated", "process", "procedure", "steps"]
	        return any(marker in lowered for marker in markers)

	    return True


	def append_jsonl(path: Path, payload: dict):
	    """Append `payload` to `path` as one ASCII-escaped JSON line.

	    The upload and log directories are created first if they are missing.
	    """
	    ensure_directories()
	    with path.open("a", encoding="utf-8") as sink:
	        serialized = json.dumps(payload, ensure_ascii=True)
	        sink.write(serialized + "\n")


	def log_event(event_type: str, **payload):
	    """Append an event record to the event log.

	    The record holds the current timestamp, `event_type` and every keyword
	    argument. A keyword named "timestamp" replaces the generated value.
	    """
	    record = {"timestamp": now_ts(), "event_type": event_type}
	    record.update(payload)
	    append_jsonl(EVENT_LOG_PATH, record)


	def sha256_file(file_path: str) -> str:
	    """Return the hexadecimal SHA-256 digest of the file at `file_path`.

	    The file is read in 1 MiB blocks so large files are never fully loaded.
	    """
	    block_size = 1024 * 1024
	    hasher = hashlib.sha256()
	    with open(file_path, "rb") as stream:
	        while True:
	            block = stream.read(block_size)
	            if block == b"":
	                break
	            hasher.update(block)
	    return hasher.hexdigest()


	def remove_session(session_id: str):
	    """Forget a session and delete its temporary directory.

	    Unknown ids are ignored. Directory removal is best effort: failures are
	    suppressed. This function does not take SESSIONS_LOCK itself.
	    """
	    stale = SESSIONS.pop(session_id, None)
	    if stale:
	        try:
	            shutil.rmtree(stale.temp_dir, ignore_errors=True)
	        except Exception:
	            # Cleanup must never break the caller.
	            pass


	def cleanup_expired_sessions():
	    """Remove sessions idle for longer than SESSION_TTL_SECONDS.

	    The session with id "phase2-corpus" is never expired. One
	    "session_expired" event is logged per removed session.

	    NOTE(review): the copy this was reviewed from had lost its indentation.
	    Removal is assumed to happen while SESSIONS_LOCK is held and logging
	    after it is released - confirm against the original file.
	    """
	    cutoff = now_ts() - SESSION_TTL_SECONDS
	    with SESSIONS_LOCK:
	        # Iterate a snapshot because removal mutates SESSIONS.
	        expired: List[str] = [
	            session_id
	            for session_id, session in list(SESSIONS.items())
	            if session_id != "phase2-corpus" and session.last_activity < cutoff
	        ]
	        for session_id in expired:
	            remove_session(session_id)
	    for session_id in expired:
	        log_event("session_expired", session_id=session_id)


	def get_embedder() -> SentenceTransformer:
	    """Return the shared sentence encoder, creating it on first use.

	    Creation happens under EMBEDDER_LOCK; the "all-MiniLM-L6-v2" model is
	    loaded on the CPU.
	    """
	    global GLOBAL_EMBEDDER
	    with EMBEDDER_LOCK:
	        embedder = GLOBAL_EMBEDDER
	        if embedder is None:
	            embedder = SentenceTransformer("all-MiniLM-L6-v2", device="cpu")
	            GLOBAL_EMBEDDER = embedder
	        return embedder


	def get_agent_template() -> AgenticSelfRAG:
	    """Return the shared agent whose model is loaded, creating it on first use.

	    Creation happens under MODEL_LOCK. The global is assigned only after
	    `load_model()` returns, so a failed load leaves it unset.
	    """
	    global GLOBAL_AGENT_TEMPLATE
	    with MODEL_LOCK:
	        if GLOBAL_AGENT_TEMPLATE is not None:
	            return GLOBAL_AGENT_TEMPLATE
	        # The template has no documents of its own, hence the empty retriever.
	        loaded = AgenticSelfRAG(EmptyRetriever(), load_in_4bit=LOAD_IN_4BIT)
	        loaded.load_model()
	        GLOBAL_AGENT_TEMPLATE = loaded
	        return GLOBAL_AGENT_TEMPLATE


	def build_session_agent(retriever: SessionRetriever) -> AgenticSelfRAG:
	    """Create an agent bound to `retriever` that reuses the template's model.

	    The generation model, tokenizer and verification agent are taken from the
	    shared template, and the new agent is marked as already loaded.
	    """
	    shared = get_agent_template()
	    session_agent = AgenticSelfRAG(retriever, load_in_4bit=LOAD_IN_4BIT)

	    # Borrow the heavy objects from the template.
	    session_agent.pipeline.gen_model = shared.pipeline.gen_model
	    session_agent.pipeline.gen_tokenizer = shared.pipeline.gen_tokenizer
	    session_agent.pipeline._loaded = True
	    session_agent.pipeline._repair_vocab = None
	    session_agent._model_loaded = True
	    session_agent.verif_agent = shared.verif_agent

	    # Point the helper agents at this session's pipeline.
	    for helper in (session_agent.qr_agent, session_agent.corr_agent):
	        helper.pipeline = session_agent.pipeline
	    return session_agent


	def extract_page_records(pdf_path: str, source_file: str) -> Tuple[List[PageRecord], str]:
	    """Extract per-page text from a PDF, trying PyMuPDF first and pypdf second.

	    Args:
	        pdf_path: Location of the PDF on disk.
	        source_file: Name stored on every resulting PageRecord.

	    Returns:
	        A tuple of (page records, extractor name). Pages without any text are
	        omitted; page numbers are 1-based positions in the PDF. The extractor
	        name is "pymupdf" or "pypdf".

	    Raises:
	        RuntimeError: when the pypdf fallback fails as well.
	    """

	    def build_records(page_texts) -> List[PageRecord]:
	        # Shared by both extractors: normalise each line, drop empty lines and
	        # skip pages that end up with no text.
	        records: List[PageRecord] = []
	        for page_num, raw_text in enumerate(page_texts, start=1):
	            lines = [line for line in map(normalize_text, (raw_text or "").splitlines()) if line]
	            text = " ".join(lines).strip()
	            if text:
	                records.append(
	                    PageRecord(source_file=source_file, page_num=page_num, text=text, lines=lines)
	                )
	        return records

	    try:
	        import fitz

	        doc = fitz.open(pdf_path)
	        try:
	            return build_records(page.get_text("text") for page in doc), "pymupdf"
	        finally:
	            # Close the document even when a page fails to extract.
	            doc.close()
	    except Exception:
	        # Deliberate best effort: any PyMuPDF problem (including the package
	        # being absent) falls through to the pypdf attempt below.
	        pass

	    try:
	        from pypdf import PdfReader

	        reader = PdfReader(pdf_path)
	        return build_records(page.extract_text() for page in reader.pages), "pypdf"
	    except Exception as exc:
	        raise RuntimeError(
	            "Could not extract text from a corpus PDF. Please verify the stored project documents are text-based PDFs "
	            "instead of scanned image-only files."
	        ) from exc


	def chunk_page_records(page_records: List[PageRecord], file_name: str) -> List[Chunk]:
	    """Split every page into overlapping word windows.

	    Windows hold up to CHUNK_WORDS words and start every
	    CHUNK_WORDS - CHUNK_OVERLAP words (at least 1). Windows of 20 words or
	    fewer are discarded. Chunks never span two pages.

	    NOTE(review): the copy this was reviewed from had lost its indentation.
	    The chunk counter is assumed to advance only when a chunk is kept -
	    confirm against the original file.
	    """
	    stride = max(1, CHUNK_WORDS - CHUNK_OVERLAP)
	    chunks: List[Chunk] = []
	    chunk_index = 0

	    for record in page_records:
	        words = record.text.split()
	        total = len(words)
	        for start in range(0, total, stride):
	            end = min(start + CHUNK_WORDS, total)
	            window = words[start:end]
	            if len(window) > 20:
	                chunks.append(
	                    Chunk(
	                        chunk_id=f"{file_name}::p{record.page_num}::c{chunk_index}",
	                        source_file=file_name,
	                        page_num=record.page_num,
	                        text=" ".join(window),
	                        # Despite the field name this is the word offset of the window.
	                        char_start=start,
	                    )
	                )
	                chunk_index += 1
	            if end == total:
	                # The window reached the end of the page.
	                break
	    return chunks


	def validate_pdf(file_path: str) -> Tuple[bool, str]:
	    """Check that `file_path` points to an acceptable PDF.

	    Returns (True, "") on success, otherwise (False, message). Checks run in
	    this order: a path was given, it ends in ".pdf" (any case), the file can
	    be read and starts with the "%PDF" magic bytes, and its size does not
	    exceed MAX_FILE_SIZE_MB.
	    """
	    if not file_path:
	        return False, "Please upload a PDF file."
	    if not file_path.lower().endswith(".pdf"):
	        return False, "Only PDF files are accepted."

	    try:
	        with open(file_path, "rb") as stream:
	            magic = stream.read(4)
	    except OSError:
	        return False, "The uploaded file could not be read."
	    if magic != b"%PDF":
	        return False, "The uploaded file does not look like a valid PDF."

	    limit_bytes = MAX_FILE_SIZE_MB * 1024 * 1024
	    if os.path.getsize(file_path) > limit_bytes:
	        return False, f"PDF exceeds the {MAX_FILE_SIZE_MB} MB file-size limit."
	    return True, ""


	def make_temp_session_dir() -> str:
	    """Create a new "session_"-prefixed directory under UPLOAD_ROOT and return its path."""
	    ensure_directories()
	    parent = str(UPLOAD_ROOT)
	    return tempfile.mkdtemp(prefix="session_", dir=parent)


	def make_metric_badge(label: str, value: str) -> str:
	    """Render a small HTML tile with a caption and a value.

	    Both `label` and `value` are HTML-escaped.
	    """
	    safe_label = html.escape(label)
	    safe_value = html.escape(value)
	    pieces = [
	        "<div style='padding:10px 12px;border-radius:14px;background:#f8fafc;",
	        "border:1px solid #dbe4f0'>",
	        "<div style='font-size:11px;letter-spacing:0.08em;text-transform:uppercase;",
	        f"opacity:0.7;margin-bottom:5px'>{safe_label}</div>",
	        f"<div style='font-size:16px;font-weight:700;color:#0f172a'>{safe_value}</div>",
	        "</div>",
	    ]
	    return "".join(pieces)


	def result_card(title: str, body: str, tone: str = "normal") -> str:
	    """Render a titled HTML card.

	    Args:
	        title: Caption shown above the body; HTML-escaped here.
	        body: Card content, inserted as-is. Callers must escape any untrusted
	            text before passing it in.
	        tone: One of "normal", "error", "warn" or "success". An unrecognised
	            tone is rendered with the "normal" colours instead of raising.

	    Returns:
	        The card as an HTML string.
	    """
	    palette = {
	        "normal": ("#ffffff", "#0f172a", "#dbe4f0"),
	        "error": ("#fff4f4", "#9f1239", "#fecdd3"),
	        "warn": ("#fff8eb", "#9a3412", "#fed7aa"),
	        "success": ("#f0fdf4", "#166534", "#bbf7d0"),
	    }
	    # A bad tone should not take down the whole page render.
	    bg, fg, border = palette.get(tone, palette["normal"])
	    return (
	        f"<div style='background:{bg};color:{fg};border:1px solid {border};"
	        "border-radius:18px;padding:18px 20px;box-shadow:0 10px 30px rgba(15,23,42,0.06)'>"
	        f"<div style='font-size:12px;letter-spacing:0.08em;text-transform:uppercase;"
	        f"opacity:0.65;margin-bottom:8px'>{html.escape(title)}</div>"
	        f"<div style='font-size:16px;line-height:1.65'>{body}</div></div>"
	    )


	def initial_result_html() -> str:
	    """Return the placeholder card shown before any question has been asked."""
	    prompt = "Ask a question about the preloaded Phase 2 project corpus to receive a grounded answer with page and line references."
	    return result_card("Ready", prompt)


	def line_window_score(query: str, answer: str, chunk_text: str, snippet: str) -> float:
	    """Score how well `snippet` supports `answer` for `query`.

	    The score is the fraction of the snippet's content terms that also occur
	    in the query, the answer or the chunk, plus a bonus of 1.0 when the
	    normalised answer appears verbatim (case-insensitively) in the snippet.
	    A blank answer never earns the bonus.
	    """
	    snippet_terms = content_terms(snippet)
	    # Set union of everything the snippet may legitimately echo.
	    reference_terms = content_terms(query) | content_terms(answer) | content_terms(chunk_text)
	    overlap = len(reference_terms & snippet_terms)

	    answer_quoted = normalize_text(answer).lower() in normalize_text(snippet).lower()
	    exact_bonus = 1.0 if answer_quoted and answer.strip() else 0.0
	    # max(1, ...) guards against snippets without any content terms.
	    return exact_bonus + overlap / max(1, len(snippet_terms))


	def locate_citation(question: str, answer: str, chunk: Optional[Chunk], session: SessionData) -> Optional[Citation]:
	    """Find the lines on the chunk's page that best support `answer`.

	    Every window of one to four consecutive lines on the page is scored with
	    line_window_score and the first best-scoring window is cited.

	    Returns:
	        None when `chunk` is None or its page has no recorded lines; otherwise
	        a Citation with 1-based, inclusive line numbers. The citation names
	        the file of the matched page record (not the session-wide file name),
	        which keeps multi-file sessions consistent with
	        locate_citation_by_file_page.
	    """
	    if chunk is None:
	        return None

	    page_record = next(
	        (
	            record
	            for record in session.page_records
	            if record.page_num == chunk.page_num and record.source_file == chunk.source_file
	        ),
	        None,
	    )
	    if page_record is None or not page_record.lines:
	        return None

	    lines = page_record.lines
	    best_score = -1.0
	    # Fallback when every window turns out empty: cite the first line.
	    best_window = (1, 1, lines[0])

	    for start_index in range(len(lines)):
	        for end_index in range(start_index, min(len(lines), start_index + 4)):
	            excerpt = " ".join(lines[start_index:end_index + 1]).strip()
	            if not excerpt:
	                continue
	            score = line_window_score(question, answer, chunk.text, excerpt)
	            if score > best_score:
	                best_score = score
	                best_window = (start_index + 1, end_index + 1, excerpt)

	    return Citation(
	        source_file=page_record.source_file,
	        page_num=chunk.page_num,
	        line_start=best_window[0],
	        line_end=best_window[1],
	        excerpt=best_window[2],
	    )


	def locate_citation_by_file_page(question: str, answer: str, session: SessionData, source_file: str, page_num: int) -> Optional[Citation]:
	    """Cite the best-supporting lines on a page identified by file and number.

	    Windows of one to four consecutive lines are scored with
	    line_window_score, using the window itself as the chunk text. Returns
	    None when the page is unknown or has no lines.
	    """
	    target = None
	    for record in session.page_records:
	        if record.source_file == source_file and record.page_num == page_num:
	            target = record
	            break
	    if target is None or not target.lines:
	        return None

	    lines = target.lines
	    line_count = len(lines)
	    candidates = []
	    for first in range(line_count):
	        for last in range(first, min(line_count, first + 4)):
	            excerpt = " ".join(lines[first:last + 1]).strip()
	            if excerpt:
	                score = line_window_score(question, answer, excerpt, excerpt)
	                candidates.append((score, first + 1, last + 1, excerpt))

	    if candidates:
	        # max() keeps the earliest window among equal scores.
	        _, line_start, line_end, excerpt = max(candidates, key=lambda item: item[0])
	    else:
	        line_start, line_end, excerpt = 1, 1, lines[0]

	    return Citation(
	        source_file=source_file,
	        page_num=page_num,
	        line_start=line_start,
	        line_end=line_end,
	        excerpt=excerpt,
	    )


	def best_evidence_sentences(
	    session: SessionData,
	    question: str,
	    plan: QuestionPlan,
	    source_filters: Optional[List[str]] = None,
	) -> Tuple[List[Chunk], List[Tuple[Chunk, str]]]:
	    """Retrieve chunks for the plan's query and pick evidence sentences.

	    When `source_filters` is given and at least one retrieved chunk comes
	    from a listed file, only those chunks are kept; otherwise the unfiltered
	    result is used. Returns the chunks considered and (chunk, sentence) pairs.
	    """
	    # Fetch generously (at least 12) but never more than the session holds.
	    fetch_count = min(max(K_PASSAGES * 3, 12), len(session.chunks))
	    candidates = session.retriever.retrieve(plan.expanded_query, k=fetch_count)

	    if source_filters:
	        from_named_docs = [chunk for chunk in candidates if chunk.source_file in source_filters]
	        if from_named_docs:
	            candidates = from_named_docs

	    sentence_budget = 4 if plan.mode in {"procedural", "descriptive"} else 3
	    picked = session.agent.pipeline.select_evidence(
	        plan.expanded_query,
	        candidates,
	        max_sentences=sentence_budget,
	    )
	    pairs = [(item.chunk, item.sentence) for item in picked]
	    return candidates, pairs


	def extract_project_name(sentence: str) -> Optional[str]:
	    """Pull the project name that follows the word "Project" in `sentence`.

	    Accepts forms such as "Project: Name", "Project - Name" and
	    "Project Name". The captured text must start with a capital letter and is
	    cut off at the first following header keyword (Status, System, Version,
	    Document, Approved, Author).

	    Returns:
	        The cleaned name, or None when nothing usable is found.
	    """
	    # Whitespace around the optional ":" / "-" separator is itself optional.
	    match = re.search(r"\bProject\s*[:\-]?\s*([A-Z][A-Za-z0-9 .-]{2,80})", sentence)
	    if not match:
	        return None
	    candidate = match.group(1)
	    # Header fields are often run together on one line; keep only the name.
	    candidate = re.split(r"\b(?:Status|System|Version|Document|Approved|Author)\b", candidate)[0].strip(" :-,")
	    return candidate or None


	def concise_factoid_answer(question: str, plan: QuestionPlan, evidence_pairs: List[Tuple[Chunk, str]]) -> Optional[str]:
	    """Extract a short typed answer from the evidence sentences.

	    Sentences are examined in order and the first value matching the
	    expected type wins. Questions mentioning "interval", "duration" or
	    "period" are answered with the first "<n> months/days/years" span,
	    whatever the expected type is. For expected type "text" the top sentence
	    is returned when its clipped form has at most 18 words.

	    Returns:
	        The answer string, or None when nothing suitable is found.
	    """
	    if not evidence_pairs:
	        return None

	    # Depends only on the question, so it is decided once up front.
	    wants_duration = any(token in question.lower() for token in ["interval", "duration", "period"])

	    for _, sentence in evidence_pairs:
	        if wants_duration:
	            match = re.search(r"\b(\d+\s+(?:months?|days?|years?))\b", sentence, re.I)
	            if match:
	                return normalize_text(match.group(1))

	        if plan.expected_type == "currency":
	            match = AMOUNT_RE.search(sentence)
	            if match:
	                # Amounts without a currency marker are reported as INR.
	                prefix = (match.group(1) or "INR").replace(".", "")
	                return normalize_text(f"{prefix} {match.group(2)}")
	            match = CURRENCY_RE.search(sentence)
	            if match:
	                return normalize_text(match.group(0))

	        if plan.expected_type == "date":
	            match = DATE_RE.search(sentence)
	            if match:
	                return normalize_text(match.group(0))

	        if plan.expected_type == "version":
	            match = VERSION_RE.search(sentence)
	            if match:
	                return normalize_text(match.group(0))

	        if plan.expected_type == "number":
	            match = NUMBER_RE.search(sentence)
	            if match:
	                return normalize_text(match.group(0))

	        if plan.expected_type == "person":
	            match = PERSON_RE.search(sentence)
	            if match:
	                return normalize_text(match.group(0))

	        if plan.expected_type in {"project_name", "name"}:
	            candidate = extract_project_name(sentence)
	            if candidate:
	                return normalize_text(candidate)

	    if plan.expected_type == "text":
	        short = clip_text(evidence_pairs[0][1], max_chars=180)
	        if len(short.split()) <= 18:
	            return short
	    return None


	def metadata_lookup_answer(session: SessionData, question: str, source_filters: List[str]) -> Optional[Tuple[str, str, int]]:
	    """Answer metadata questions from the parsed headers and the VMP table.

	    Returns (answer, source file, page number), or None when no stored field
	    applies. If the question asks about an author or approver but names no
	    document, a clarification request is returned with an empty source and
	    page 0.
	    """
	    q = question.lower()
	    headers = session.structured.get("headers", {})
	    vmp_table = session.structured.get("vmp_table", {})

	    # "approve" also covers "approver".
	    if not source_filters and ("author" in q or "approve" in q):
	        return (
	            "Please specify which document you mean, for example Validation Master Plan, Configuration Guide, or Traceability Matrix.",
	            "",
	            0,
	        )

	    # (question keyword, stored field) pairs, tried in this order per document.
	    header_priority = (("qa approver", "qa_approver"), ("author", "author"))
	    table_fields = (("approve", "approver"), ("document id", "document_id"), ("phase", "phase"))
	    header_fallback = (
	        ("approved date", "approved_date"),
	        ("version", "version"),
	        ("system", "system"),
	        ("status", "status"),
	        ("project", "project"),
	    )

	    for file_name in source_filters:
	        row = vmp_table.get(pretty_doc_name(file_name).lower())
	        header = headers.get(file_name, {})

	        for keyword, key in header_priority:
	            if keyword in q and header.get(key):
	                return header[key], file_name, 1

	        if row:
	            for keyword, key in table_fields:
	                if keyword in q and row.get(key):
	                    # Table answers are attributed to the Validation Master Plan.
	                    return row[key], "02_Validation_Master_Plan.pdf", int(row.get("page_num", "2"))

	        for keyword, key in header_fallback:
	            if keyword in q and header.get(key):
	                return header[key], file_name, 1

	    return None


	def summarize_procedural_answer(evidence_pairs: List[Tuple[Chunk, str]]) -> Optional[str]:
	    """Join the leading distinct evidence sentences into a short summary.

	    Each sentence is clipped to 220 characters, duplicates are ignored
	    case-insensitively, and at most MAX_SUMMARY_SENTENCES are used. Returns
	    None when there is no evidence.
	    """
	    if not evidence_pairs:
	        return None

	    picked: List[str] = []
	    already_used = set()
	    for _, sentence in evidence_pairs:
	        clipped = clip_text(sentence, max_chars=220)
	        fingerprint = clipped.lower()
	        if fingerprint not in already_used:
	            already_used.add(fingerprint)
	            picked.append(clipped)
	            if len(picked) >= MAX_SUMMARY_SENTENCES:
	                break

	    if picked:
	        return "Based on the document: " + " ".join(picked)
	    return None


	def answer_relevance_proxy(question: str, answer: str, citation: Optional[Citation]) -> Optional[float]:
	    """Estimate how much of the question the answer and its citation cover.

	    Returns:
	        None for an empty answer or an abstention (an answer starting with
	        "i don't have enough evidence", case-insensitively); 1.0 when the
	        question has no content terms; otherwise the share of the question's
	        content terms found in the answer or the cited excerpt, rounded to
	        four decimals.
	    """
	    if not answer or answer.lower().startswith("i don't have enough evidence"):
	        return None
	    query_terms = content_terms(question)
	    support_terms = content_terms(answer)
	    if citation:
	        # The cited excerpt counts as support alongside the answer itself.
	        support_terms |= content_terms(citation.excerpt)
	    if not query_terms:
	        return 1.0
	    return round(len(query_terms & support_terms) / len(query_terms), 4)


	def context_precision_proxy(question: str, answer: str, retrieved_chunks: List[Chunk]) -> Optional[float]:
	    """Return the share of retrieved chunks that look relevant.

	    A chunk counts as relevant when it shares at least one content term with
	    the question or the answer. Returns None when nothing was retrieved; the
	    ratio is rounded to four decimals.
	    """
	    if not retrieved_chunks:
	        return None

	    wanted_terms = (content_terms(question), content_terms(answer))

	    def is_relevant(chunk) -> bool:
	        terms = content_terms(chunk.text)
	        return any(wanted & terms for wanted in wanted_terms)

	    hits = sum(1 for chunk in retrieved_chunks if is_relevant(chunk))
	    return round(hits / len(retrieved_chunks), 4)


	def faithfulness_proxy(answer: str, citation: Optional[Citation]) -> Optional[float]:
	    """Estimate how much of the answer is backed by the cited excerpt.

	    Returns None without an answer or citation, 1.0 when the normalised
	    answer occurs verbatim (case-insensitively) in the excerpt, and otherwise
	    the share of the answer's content terms found in the excerpt, rounded to
	    four decimals. An answer without content terms that is not quoted
	    verbatim scores 0.0.
	    """
	    if not answer or not citation:
	        return None

	    answer_norm = normalize_text(answer).lower()
	    excerpt_norm = normalize_text(citation.excerpt).lower()
	    if answer_norm and answer_norm in excerpt_norm:
	        return 1.0

	    answer_terms = content_terms(answer)
	    if not answer_terms:
	        # The verbatim case was handled above, so nothing supports the answer.
	        return 0.0
	    supported = answer_terms & content_terms(citation.excerpt)
	    return round(len(supported) / len(answer_terms), 4)


	def citation_html(citation: Optional[Citation]) -> str:
	    """Render a citation as HTML: file name, page/line label and excerpt.

	    File name and excerpt are HTML-escaped. Without a citation a plain
	    fallback sentence is returned.
	    """
	    if citation is None:
	        return "No supporting citation was selected."

	    if citation.line_start == citation.line_end:
	        line_label = f"line {citation.line_start}"
	    else:
	        line_label = f"lines {citation.line_start}-{citation.line_end}"

	    parts = [
	        f"<div style='font-weight:700;font-size:18px;margin-bottom:8px'>{html.escape(citation.source_file)}</div>",
	        f"<div style='font-size:13px;opacity:0.75;margin-bottom:10px'>",
	        f"Page {citation.page_num}, {line_label} (extracted text)</div>",
	        f"<div>{html.escape(citation.excerpt)}</div>",
	    ]
	    return "".join(parts)


	def render_result(
	    question: str,
	    answer: str,
	    citation: Optional[Citation],
	    metrics: Dict[str, Optional[float]],
	    abstained: bool,
	) -> str:
	    """Render the question, answer, source reference and metric tiles as HTML.

	    Args:
	        question: The user's question; HTML-escaped here.
	        answer: The final answer text; HTML-escaped here.
	        citation: Supporting citation, or None.
	        metrics: Values read from the keys "latency_seconds",
	            "answer_relevance", "context_precision", "faithfulness_proxy"
	            and "hallucination_rate". A value of None is shown as "N/A".
	            A missing "latency_seconds" is shown as "N/A" and a missing
	            "hallucination_rate" as 0.00, instead of raising.
	        abstained: When True both cards use the warning colours.

	    Returns:
	        The complete result panel as an HTML string.
	    """

	    def fmt(value: Optional[float], suffix: str = "") -> str:
	        # One formatting rule for every tile: None never reaches ":.2f".
	        return "N/A" if value is None else f"{value:.2f}{suffix}"

	    answer_card = result_card(
	        "Final Answer",
	        html.escape(answer),
	        tone="warn" if abstained else "normal",
	    )
	    source_card = result_card(
	        "Source Reference",
	        citation_html(citation),
	        tone="warn" if abstained else "success",
	    )

	    metric_boxes = "".join(
	        [
	            make_metric_badge("Latency", fmt(metrics.get("latency_seconds"), "s")),
	            make_metric_badge("Answer Relevance", fmt(metrics.get("answer_relevance"))),
	            make_metric_badge("Context Precision", fmt(metrics.get("context_precision"))),
	            make_metric_badge("Faithfulness", fmt(metrics.get("faithfulness_proxy"))),
	            make_metric_badge("Hallucination", fmt(metrics.get("hallucination_rate", 0.0))),
	        ]
	    )

	    question_html = (
	        "<div style='margin-bottom:14px;padding:14px 16px;border-radius:16px;"
	        "background:linear-gradient(135deg,#eef4ff 0%,#f8fafc 100%);"
	        "border:1px solid #d9e6ff;color:#0f172a'>"
	        "<div style='font-size:12px;letter-spacing:0.08em;text-transform:uppercase;"
	        "opacity:0.65;margin-bottom:6px'>Question</div>"
	        f"<div style='font-size:18px;font-weight:600;line-height:1.5'>{html.escape(question)}</div>"
	        "</div>"
	    )

	    return (
	        "<div style='font-family:ui-sans-serif,system-ui,sans-serif'>"
	        f"{question_html}"
	        "<div style='display:grid;grid-template-columns:1fr;gap:14px'>"
	        f"{answer_card}{source_card}"
	        f"<div style='display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:10px'>{metric_boxes}</div>"
	        "</div></div>"
	    )


	def render_document_status(session: SessionData) -> str:
	    """Render a success card summarising what the session has indexed.

	    The document count is the number of distinct source files among the
	    session's page records.
	    """
	    distinct_files = {record.source_file for record in session.page_records}
	    doc_count = len(distinct_files)
	    rows = [
	        f"<strong>{html.escape(session.file_name)}</strong>",
	        f"Documents indexed: {doc_count}",
	        f"Pages indexed: {session.page_count}",
	        f"Chunks indexed: {len(session.chunks)}",
	        f"Extractor used: {html.escape(session.extractor)}",
	        f"Knowledge base mode: preloaded project corpus",
	    ]
	    return result_card("Corpus Loaded", "<br>".join(rows), tone="success")


	def error_html(message: str) -> str:
	    """Wrap an escaped `message` in an error-toned "Action Required" card."""
	    safe_message = html.escape(message)
	    return result_card("Action Required", safe_message, tone="error")


	def info_html(message: str) -> str:
	    """Wrap an escaped `message` in a warning-toned "Notice" card."""
	    safe_message = html.escape(message)
	    return result_card("Notice", safe_message, tone="warn")


	def build_session(file_path: str) -> SessionData:
	    """Validate, copy, extract, chunk and index one PDF into a new session.

	    The PDF is copied into a fresh temporary directory. If any later step
	    fails, that directory is removed and the exception is re-raised.

	    Raises:
	        ValueError: when validation fails, no text can be extracted, or the
	            page or chunk limit is exceeded.
	    """
	    ok, reason = validate_pdf(file_path)
	    if not ok:
	        raise ValueError(reason)

	    temp_dir = make_temp_session_dir()
	    try:
	        file_name = os.path.basename(file_path)
	        stored_path = os.path.join(temp_dir, file_name)
	        shutil.copy2(file_path, stored_path)

	        page_records, extractor_name = extract_page_records(stored_path, file_name)
	        # Only pages that yielded text are counted here.
	        page_total = len(page_records)
	        if page_total == 0:
	            raise ValueError(
	                "No extractable text was found in the PDF. Please upload a text-based PDF."
	            )
	        if page_total > MAX_PAGES:
	            raise ValueError(
	                f"PDF has {len(page_records)} pages, which exceeds the {MAX_PAGES}-page limit."
	            )

	        chunks = chunk_page_records(page_records, file_name)
	        if not chunks:
	            raise ValueError(
	                "The extracted PDF text did not produce enough content to index."
	            )
	        if len(chunks) > MAX_CHUNKS:
	            raise ValueError(
	                f"PDF produced {len(chunks)} chunks, which exceeds the {MAX_CHUNKS}-chunk limit."
	            )

	        file_hash = sha256_file(stored_path)
	        file_size_bytes = os.path.getsize(stored_path)
	        retriever = SessionRetriever(chunks, get_embedder())
	        agent = build_session_agent(retriever)

	        return SessionData(
	            session_id=str(uuid.uuid4()),
	            temp_dir=temp_dir,
	            pdf_path=stored_path,
	            file_name=file_name,
	            file_hash=file_hash,
	            file_size_bytes=file_size_bytes,
	            page_records=page_records,
	            chunks=chunks,
	            retriever=retriever,
	            agent=agent,
	            page_count=page_total,
	            extractor=extractor_name,
	            structured={},
	        )
	    except Exception:
	        # Do not leave a half-built session directory behind.
	        shutil.rmtree(temp_dir, ignore_errors=True)
	        raise


	def parse_header_metadata(page_records: List[PageRecord]) -> Dict[str, str]:
	    """Read "Label:" / value line pairs from the top of the first page.

	    Scanning stops at the first line that starts with a numbered heading
	    ("1.", "2." ...). A line ending in ":" names a field (lower-cased, spaces
	    replaced by underscores) whose value is the following line. All other
	    lines are treated as title lines.

	    Returns:
	        The parsed fields plus "header_text" (the first 20 lines joined by
	        spaces) and "title" (the first three title lines joined by " | ").
	        An empty dict when there are no page records.
	    """
	    if not page_records:
	        return {}
	    lines = page_records[0].lines
	    metadata: Dict[str, str] = {}
	    title_lines: List[str] = []
	    i = 0
	    while i < len(lines):
	        line = lines[i]
	        if re.match(r"^\d+\.", line):
	            break
	        if line.endswith(":") and i + 1 < len(lines):
	            key = line[:-1].strip().lower().replace(" ", "_")
	            metadata[key] = lines[i + 1].strip()
	            # Skip the value line that was just consumed.
	            i += 2
	            continue
	        title_lines.append(line)
	        i += 1
	    metadata["header_text"] = " ".join(lines[:20])
	    metadata["title"] = " | ".join(title_lines[:3])
	    return metadata


	def parse_vmp_table(page_records: List[PageRecord]) -> Dict[str, Dict[str, str]]:
	    """Parse the document table whose cells were extracted one per line.

	    Parsing starts after a line that is exactly "Document" (that line and the
	    four after it are skipped) and stops at the line starting with
	    "4. Roles and Responsibilities". A line matching a known document title
	    opens a row whose next four lines are read as document id, phase, author
	    and approver.

	    Returns:
	        Rows keyed by lower-cased document title; each row also records the
	        page number of its title line as a string. When no row is found a
	        "vmp_table_parse_empty" event is logged and an empty dict is returned.
	    """
	    known_docs = {
	        "Project Charter",
	        "Validation Master Plan",
	        "User Requirements Specification",
	        "Functional Requirements Specification",
	        "Risk Assessment",
	        "HP ALM Configuration Guide",
	        "IQ Protocol",
	        "IQ Execution Report",
	        "OQ Protocol",
	        "OQ Execution Report",
	        "PQ/UAT Protocol and Report",
	        "Data Migration Plan",
	        "Data Migration Summary Report",
	        "Validation Summary Report",
	        "Traceability Matrix",
	        "Change Control SOP",
	    }
	    # Flatten every page into (page number, line) pairs.
	    all_lines: List[Tuple[int, str]] = [
	        (record.page_num, line) for record in page_records for line in record.lines
	    ]
	    line_total = len(all_lines)

	    rows: Dict[str, Dict[str, str]] = {}
	    inside_table = False
	    cursor = 0
	    while cursor < line_total:
	        page_num, line = all_lines[cursor]
	        if line == "Document":
	            # Jump over the five header cells.
	            inside_table = True
	            cursor += 5
	        elif not inside_table:
	            cursor += 1
	        elif line.startswith("4. Roles and Responsibilities"):
	            break
	        elif line in known_docs and cursor + 4 < line_total:
	            doc_id, phase, author, approver = (
	                cell for _, cell in all_lines[cursor + 1:cursor + 5]
	            )
	            rows[line.lower()] = {
	                "document": line,
	                "document_id": doc_id,
	                "phase": phase,
	                "author": author,
	                "approver": approver,
	                "page_num": str(page_num),
	            }
	            cursor += 5
	        else:
	            cursor += 1

	    if not rows:
	        log_event(
	            "vmp_table_parse_empty",
	            source_file="02_Validation_Master_Plan.pdf",
	            page_count=len(page_records),
	        )
	    return rows


	def corpus_pdf_files(candidate: Path) -> List[Path]:
	    """List the numbered corpus PDFs in ``candidate`` if it looks like the corpus.

	    Only files directly inside ``candidate`` whose name matches
	    ``NN_<anything>.pdf`` are considered. The directory qualifies when it holds
	    the three required files or at least ten numbered PDFs.

	    Args:
	        candidate: Directory to inspect.

	    Returns:
	        The numbered PDFs in sorted order, or an empty list when ``candidate``
	        is not an existing directory or does not qualify.
	    """
	    if not (candidate.exists() and candidate.is_dir()):
	        return []

	    numbered_name = re.compile(r"^\d{2}_.+\.pdf$")
	    numbered = [
	        entry
	        for entry in sorted(candidate.glob("*.pdf"))
	        if entry.is_file() and numbered_name.match(entry.name)
	    ]

	    required = {
	        "01_Project_Charter.pdf",
	        "02_Validation_Master_Plan.pdf",
	        "15_Regulatory_Reference_Guide.pdf",
	    }
	    present = {entry.name for entry in numbered}
	    if required <= present or len(numbered) >= 10:
	        return numbered
	    return []


	def resolve_corpus_dir() -> Path:
	    """Return the first entry of ``CORPUS_CANDIDATES`` that holds a usable corpus.

	    A candidate is usable when ``corpus_pdf_files`` returns a non-empty list
	    for it.

	    Raises:
	        FileNotFoundError: When no candidate qualifies.
	    """
	    usable = next(
	        (folder for folder in CORPUS_CANDIDATES if corpus_pdf_files(folder)),
	        None,
	    )
	    if usable is None:
	        raise FileNotFoundError(
	            "Phase 2 corpus not found. Upload the 15 PDF files either into a phase2_corpus folder in the app repo or at the repo root."
	        )
	    return usable


	def build_corpus_session() -> SessionData:
	    """Build the single session that indexes every PDF of the preloaded corpus.

	    Each corpus PDF is extracted into page records, chunked, and its first-page
	    header metadata is stored under ``structured["headers"][file_name]``. The
	    table parsed from ``02_Validation_Master_Plan.pdf`` is stored under
	    ``structured["vmp_table"]``. A retriever and an agent are then built over
	    all chunks.

	    Returns:
	        A ``SessionData`` with the fixed id ``"phase2-corpus"``, an empty
	        ``temp_dir``, ``file_size_bytes`` of 0, and a ``file_hash`` derived from
	        each file's name, size and whole-second modification time (not from
	        file contents).

	    Raises:
	        FileNotFoundError: Propagated from ``resolve_corpus_dir`` when no
	            candidate directory qualifies.
	        ValueError: When the corpus produced no chunks to index.
	    """
	    corpus_dir = resolve_corpus_dir()
	    page_records: List[PageRecord] = []
	    chunks: List[Chunk] = []
	    structured: Dict[str, dict] = {"headers": {}, "vmp_table": {}, "corpus_dir": str(corpus_dir)}
	    extractors = set()
	    file_hash_parts: List[str] = []

	    for pdf_path in corpus_pdf_files(corpus_dir):
	        file_name = pdf_path.name
	        doc_pages, extractor_name = extract_page_records(str(pdf_path), file_name)
	        extractors.add(extractor_name)
	        page_records.extend(doc_pages)
	        chunks.extend(chunk_page_records(doc_pages, file_name))
	        structured["headers"][file_name] = parse_header_metadata(doc_pages)
	        # One stat() call so size and mtime come from the same snapshot.
	        file_stat = pdf_path.stat()
	        file_hash_parts.append(f"{file_name}:{file_stat.st_size}:{int(file_stat.st_mtime)}")
	        if file_name == "02_Validation_Master_Plan.pdf":
	            structured["vmp_table"] = parse_vmp_table(doc_pages)

	    # Same guard build_session applies to uploads: fail with a clear message
	    # instead of handing an empty chunk list to the retriever.
	    if not chunks:
	        raise ValueError(
	            "The Phase 2 corpus PDFs did not produce enough content to index."
	        )

	    retriever = SessionRetriever(chunks, get_embedder())
	    agent = build_session_agent(retriever)
	    # The separator is backslash + pipe, written with an explicit escape. The
	    # joined bytes are unchanged, so corpus hashes already in the logs remain
	    # comparable; only the invalid "\|" escape sequence is gone.
	    corpus_hash = hashlib.sha256("\\|".join(file_hash_parts).encode("utf-8")).hexdigest()
	    return SessionData(
	        session_id="phase2-corpus",
	        temp_dir="",
	        pdf_path=str(corpus_dir),
	        file_name="Phase 2 corpus",
	        file_hash=corpus_hash,
	        file_size_bytes=0,
	        page_records=page_records,
	        chunks=chunks,
	        retriever=retriever,
	        agent=agent,
	        # Distinct (file, page) pairs, so repeated page numbers across files count separately.
	        page_count=len({(record.source_file, record.page_num) for record in page_records}),
	        extractor=" / ".join(sorted(extractors)),
	        structured=structured,
	    )


	def handle_upload(file_obj, current_session_id: Optional[str]):
	    """Replace the caller's session with one built from an uploaded PDF.

	    Expired sessions are cleaned up first, and the session named by
	    ``current_session_id`` is removed before the new file is looked at.

	    Args:
	        file_obj: Either a path string or an object exposing the path as ``name``.
	        current_session_id: Id of the session to discard, if any.

	    Returns:
	        A 5-tuple ``(session_id, status_html, initial_result_html(), None, "")``.
	        ``session_id`` is ``None`` when no file was given or the upload was
	        rejected; rejections are logged as ``upload_rejected``.
	    """

	    def reply(session_id, status_html):
	        # The last three outputs are identical for every outcome.
	        return session_id, status_html, initial_result_html(), None, ""

	    cleanup_expired_sessions()
	    if current_session_id:
	        with SESSIONS_LOCK:
	            remove_session(current_session_id)

	    if isinstance(file_obj, str):
	        file_path = file_obj
	    else:
	        file_path = getattr(file_obj, "name", None)
	    if not file_path:
	        return reply(None, info_html("Upload one PDF file to start a testing session."))

	    try:
	        new_session = build_session(file_path)
	        with SESSIONS_LOCK:
	            SESSIONS[new_session.session_id] = new_session
	        log_event(
	            "upload_success",
	            session_id=new_session.session_id,
	            file_hash=new_session.file_hash,
	            file_name=new_session.file_name,
	            page_count=new_session.page_count,
	            chunk_count=len(new_session.chunks),
	            file_size_bytes=new_session.file_size_bytes,
	            extractor=new_session.extractor,
	        )
	        return reply(new_session.session_id, render_document_status(new_session))
	    except Exception as exc:
	        # Any failure while building or registering the session is reported
	        # to the caller as an error panel rather than raised.
	        log_event("upload_rejected", reason=str(exc), file_name=os.path.basename(file_path))
	        return reply(None, error_html(str(exc)))


	def get_session(session_id: Optional[str]) -> Optional[SessionData]:
	    """Return the registered session for ``session_id``.

	    Returns ``None`` when the id is falsy or no such session is registered.
	    The registry is read while holding ``SESSIONS_LOCK``.
	    """
	    if session_id:
	        with SESSIONS_LOCK:
	            return SESSIONS.get(session_id)
	    return None


	def check_rate_limit(session: SessionData) -> Optional[str]:
	    """Apply the per-session limit on questions within the last 60 seconds.

	    Timestamps older than the window are dropped from
	    ``session.question_timestamps``. When the remaining count is below
	    ``MAX_QUESTIONS_PER_MINUTE`` the current time is recorded.

	    Returns:
	        ``None`` when the question is allowed, otherwise the message to show.
	    """
	    current = now_ts()
	    cutoff = current - 60
	    recent = [stamp for stamp in session.question_timestamps if stamp >= cutoff]
	    session.question_timestamps = recent
	    if len(recent) < MAX_QUESTIONS_PER_MINUTE:
	        # `recent` is the list now stored on the session, so this records the question.
	        recent.append(current)
	        return None
	    return (
	        f"Rate limit reached. Please wait before asking more than "
	        f"{MAX_QUESTIONS_PER_MINUTE} questions per minute in one session."
	    )


	def build_question_metrics(question: str, answer: str, citation: Optional[Citation], retrieved_chunks: List[Chunk], hallucination_rate: float, latency_seconds: float) -> Dict[str, Optional[float]]:
	    """Collect the per-question metrics into one dict.

	    ``latency_seconds`` and ``hallucination_rate`` are passed through unchanged;
	    the other three values come from the proxy scoring helpers.
	    """
	    relevance = answer_relevance_proxy(question, answer, citation)
	    precision = context_precision_proxy(question, answer, retrieved_chunks)
	    faithfulness = faithfulness_proxy(answer, citation)
	    metrics: Dict[str, Optional[float]] = {}
	    metrics["latency_seconds"] = latency_seconds
	    metrics["answer_relevance"] = relevance
	    metrics["context_precision"] = precision
	    metrics["faithfulness_proxy"] = faithfulness
	    metrics["hallucination_rate"] = hallucination_rate
	    return metrics


	# Answer text used whenever the app abstains.
	_INSUFFICIENT_EVIDENCE_ANSWER = "I don't have enough evidence in the project corpus to answer that reliably."


	def _citation_fields(citation: Optional[Citation]) -> Dict[str, object]:
	    """Return the citation columns shared by the response state and the log row."""
	    return {
	        "source_file": citation.source_file if citation else None,
	        "page_num": citation.page_num if citation else None,
	        "line_start": citation.line_start if citation else None,
	        "line_end": citation.line_end if citation else None,
	        "excerpt": citation.excerpt if citation else None,
	    }


	def _run_agentic_fallback(session, question: str, best_chunk):
	    """Run the session agent; return (answer, abstained, hallucination_rate, best_chunk, error)."""
	    try:
	        output = session.agent.run(question)
	        best_chunk = output.best_chunk or best_chunk
	        answer = (
	            _INSUFFICIENT_EVIDENCE_ANSWER
	            if output.abstained
	            else (output.answer or "No answer produced.")
	        )
	        return answer, output.abstained, output.hallucination_rate or 0.0, best_chunk, None
	    except Exception as exc:
	        # Inference errors are logged and handed back as text so the caller
	        # can show an error panel instead of crashing the request.
	        log_event("inference_failed", session_id=session.session_id, question=question, error=str(exc))
	        return "", False, 0.0, best_chunk, str(exc)


	def _publish_answer(session, question: str, plan, answer_text: str, citation, metrics, abstained: bool, route_mode: str, latency_seconds: float):
	    """Render the answer, build the response state and append the interaction log row."""
	    result_html = render_result(
	        question=question,
	        answer=answer_text,
	        citation=citation,
	        metrics=metrics,
	        abstained=abstained,
	    )
	    cited = _citation_fields(citation)
	    response_state = {
	        "session_id": session.session_id,
	        "question": question,
	        "answer": answer_text,
	        "abstained": abstained,
	        **cited,
	        "metrics": metrics,
	        "file_hash": session.file_hash,
	        "route_mode": route_mode,
	        "expected_type": plan.expected_type,
	    }
	    append_jsonl(
	        INTERACTION_LOG_PATH,
	        {
	            "timestamp": now_ts(),
	            "session_id": session.session_id,
	            "file_hash": session.file_hash,
	            "file_name": session.file_name,
	            "page_count": session.page_count,
	            "question": question,
	            "answer": answer_text,
	            "abstained": abstained,
	            **cited,
	            "latency_seconds": latency_seconds,
	            "route_mode": route_mode,
	            "expected_type": plan.expected_type,
	            "answer_relevance": metrics["answer_relevance"],
	            "context_precision": metrics["context_precision"],
	            "faithfulness_proxy": metrics["faithfulness_proxy"],
	            "hallucination_rate": metrics["hallucination_rate"],
	        },
	    )
	    return result_html, response_state, ""


	def ask_question(question: str, session_id: Optional[str]):
	    """Answer one question against the session's corpus and log the interaction.

	    Routing, in order:
	      1. A structured metadata lookup; a hit is answered directly with
	         ``route_mode="structured"`` and a logged latency of 0.0.
	      2. Evidence-sentence retrieval. For ``procedural``/``descriptive`` plans a
	         summary is used when the evidence has the expected type; otherwise
	         the agent is tried if the plan allows it. For all other plans a
	         concise answer is used when the evidence has the expected type, the
	         agent is tried only when the type matches but no concise answer was
	         produced, and everything else abstains.

	    The measured latency starts after evidence retrieval, so it covers answer
	    selection and any agent run but not the retrieval itself.

	    NOTE(review): ``cleanup_expired_sessions()`` runs before the session lookup.
	    If it can evict the preloaded corpus session, the "not available" branch
	    below would persist until restart - confirm the preloaded session is exempt.

	    Args:
	        question: Raw question text; surrounding whitespace is stripped.
	        session_id: Id of the session to query.

	    Returns:
	        ``(result_html, response_state, "")``. ``response_state`` is ``None``
	        for empty questions, missing sessions, rate-limited requests and
	        inference failures.
	    """
	    cleanup_expired_sessions()
	    question = (question or "").strip()
	    if not question:
	        return info_html("Enter a question to query the preloaded project corpus."), None, ""

	    session = get_session(session_id)
	    if session is None:
	        return error_html("The preloaded corpus is not available right now. Please reload the app."), None, ""

	    session.last_activity = now_ts()
	    rate_limit_message = check_rate_limit(session)
	    if rate_limit_message:
	        log_event("rate_limited", session_id=session.session_id, question=question)
	        return error_html(rate_limit_message), None, ""

	    plan = question_plan(question)
	    source_filters = matched_source_files(question)

	    metadata_hit = metadata_lookup_answer(session, question, source_filters)
	    if metadata_hit:
	        answer_text, source_file, source_page = metadata_hit
	        citation = (
	            locate_citation_by_file_page(question, answer_text, session, source_file, source_page)
	            if source_file
	            else None
	        )
	        # Structured answers involve no retrieval or generation: no retrieved
	        # chunks, hallucination rate 0.0 and latency 0.0.
	        metrics = build_question_metrics(question, answer_text, citation, [], 0.0, 0.0)
	        return _publish_answer(
	            session,
	            question,
	            plan,
	            answer_text,
	            citation,
	            metrics,
	            abstained=False,
	            route_mode="structured",
	            latency_seconds=0.0,
	        )

	    retrieved_chunks, evidence_pairs = best_evidence_sentences(session, question, plan, source_filters=source_filters)
	    evidence_sentences = [sentence for _, sentence in evidence_pairs]

	    start = time.perf_counter()
	    best_chunk = evidence_pairs[0][0] if evidence_pairs else None
	    abstained = False
	    hallucination_rate = 0.0

	    # Decide on a direct (non-agent) answer and whether this route may fall
	    # back to the agent at all; the plan's own flag is checked further down.
	    direct_answer = None
	    route_permits_fallback = False
	    if evidence_pairs:
	        if plan.mode in {"procedural", "descriptive"}:
	            if evidence_has_expected_type(plan, evidence_sentences):
	                direct_answer = summarize_procedural_answer(evidence_pairs)
	            # This route may use the agent even when the evidence type is off.
	            route_permits_fallback = True
	        else:
	            concise_answer = concise_factoid_answer(question, plan, evidence_pairs)
	            if evidence_has_expected_type(plan, evidence_sentences):
	                direct_answer = concise_answer
	                # Other routes use the agent only when the evidence type matches.
	                route_permits_fallback = True

	    if direct_answer:
	        answer_text = direct_answer
	    elif route_permits_fallback and plan.allow_agentic_fallback:
	        answer_text, abstained, hallucination_rate, best_chunk, inference_error = _run_agentic_fallback(
	            session, question, best_chunk
	        )
	        if inference_error:
	            return error_html(f"Inference failed: {inference_error}"), None, ""
	    else:
	        abstained = True
	        answer_text = _INSUFFICIENT_EVIDENCE_ANSWER

	    latency_seconds = time.perf_counter() - start
	    citation = locate_citation(question, answer_text, best_chunk, session)
	    metrics = build_question_metrics(
	        question,
	        answer_text,
	        citation,
	        retrieved_chunks,
	        hallucination_rate,
	        latency_seconds,
	    )
	    return _publish_answer(
	        session,
	        question,
	        plan,
	        answer_text,
	        citation,
	        metrics,
	        abstained=abstained,
	        route_mode=plan.mode,
	        latency_seconds=latency_seconds,
	    )


	def submit_feedback(response_state: Optional[dict], vote: str):
	    """Log a feedback vote for the last answer and return a status message.

	    Args:
	        response_state: State stored for the last answer; falsy when no
	            question has been answered yet.
	        vote: Vote label; ``"helpful"`` selects the first thank-you message.

	    Returns:
	        A prompt to ask a question first when there is no state, otherwise a
	        thank-you message. A ``feedback`` event is logged in the latter case.
	    """
	    if not response_state:
	        return "Ask a question first, then rate the answer."

	    # Built in the same order the fields are logged: identifiers, vote, citation.
	    details = {key: response_state.get(key) for key in ("session_id", "file_hash", "question")}
	    details["vote"] = vote
	    for key in ("source_file", "page_num", "line_start", "line_end"):
	        details[key] = response_state.get(key)
	    log_event("feedback", **details)

	    return (
	        "Thanks. Your feedback was recorded as helpful."
	        if vote == "helpful"
	        else "Thanks. Your feedback was recorded for review."
	    )


	def _preload_corpus_session():
	    """Build and register the corpus session.

	    Returns ``(session, status_html, notice)``. On any failure the session is
	    ``None``, the status is an error panel and the notice is the error text.
	    """
	    try:
	        session = build_corpus_session()
	        with SESSIONS_LOCK:
	            SESSIONS[session.session_id] = session
	        return session, render_document_status(session), ""
	    except Exception as exc:
	        # Startup must not crash; the failure is surfaced through the status and notice.
	        message = str(exc)
	        return None, error_html(message), message


	ensure_directories()
	PRELOADED_SESSION, PRELOADED_STATUS_HTML, STARTUP_NOTICE = _preload_corpus_session()


	# UI layout. Components are created in display order; the multi-line text
	# blocks are kept exactly as written because their content is rendered.
	with gr.Blocks(css=CSS) as demo:
	    initial_session_id = PRELOADED_SESSION.session_id if PRELOADED_SESSION else None
	    session_state = gr.State(initial_session_id)
	    response_state = gr.State(None)

	    gr.Markdown(
	        f"""
	# {APP_NAME}
	{APP_TAGLINE}

	Project corpus mode
	- Preloaded Phase 2 project documents
	- Optimized for this fixed validation corpus
	- Best for onboarding, project lookup, and validation Q&A
	- Rate limit: {MAX_QUESTIONS_PER_MINUTE} questions per minute
	"""
	    )

	    gr.Markdown(
	        f"""
	<div style="padding:14px 16px;border-radius:14px;background:#fff8eb;border:1px solid #fed7aa;color:#9a3412">
	<strong>Privacy notice</strong><br>
	{html.escape(PRIVACY_NOTICE)}
	</div>
	"""
	    )

	    # Shown only when preloading the corpus failed at startup.
	    if STARTUP_NOTICE:
	        gr.HTML(error_html(STARTUP_NOTICE))

	    document_status = gr.HTML(PRELOADED_STATUS_HTML)

	    with gr.Row():
	        question_box = gr.Textbox(
	            label="Your Question",
	            lines=2,
	            placeholder="Ask a question about the preloaded project documents...",
	            scale=5,
	        )
	        ask_btn = gr.Button("Ask", elem_id="ask_btn", scale=1)

	    result_html_component = gr.HTML(initial_result_html())

	    with gr.Row():
	        helpful_btn = gr.Button("Helpful")
	        needs_work_btn = gr.Button("Needs Improvement")

	    feedback_status = gr.Textbox(
	        label="Feedback Status",
	        value="",
	        interactive=False,
	    )

	    with gr.Accordion("Testing guardrails and privacy details", open=False):
	        gr.Markdown(
	            f"""
	- Knowledge base: 15 preloaded Phase 2 project PDFs
	- Citation format: page and extracted-text line range
	- Queueing: concurrency limit {QUEUE_CONCURRENCY}, queue size {QUEUE_MAX_SIZE}
	- Logged for evaluation: corpus question, answer, citation, latency, and proxy metrics
	"""
	        )

	    # The Ask button and pressing Enter in the question box run the same handler.
	    for ask_trigger in (ask_btn.click, question_box.submit):
	        ask_trigger(
	            ask_question,
	            inputs=[question_box, session_state],
	            outputs=[result_html_component, response_state, feedback_status],
	        )

	    helpful_btn.click(
	        lambda state: submit_feedback(state, "helpful"),
	        inputs=[response_state],
	        outputs=[feedback_status],
	    )
	    needs_work_btn.click(
	        lambda state: submit_feedback(state, "needs_improvement"),
	        inputs=[response_state],
	        outputs=[feedback_status],
	    )


	# Script entry point: prepare directories, drop stale sessions, then serve
	# the UI through a bounded request queue.
	if __name__ == "__main__":
	    ensure_directories()
	    cleanup_expired_sessions()
	    queued_demo = demo.queue(
	        default_concurrency_limit=QUEUE_CONCURRENCY,
	        max_size=QUEUE_MAX_SIZE,
	    )
	    queued_demo.launch()