Spaces:

pro580
/

customer-support-agent

Running

App Files Files Community

customer-support-agent / scripts /generate_interview_pdf.py

pro580

Fix rate limiter to use X-Forwarded-For header behind HF proxy

e323466 2 months ago

Raw

History Blame Contribute Delete

68.1 kB

	"""Generate a recruiter interview Q&A PDF for the intent classifier project.

	Covers every likely question a recruiter or technical interviewer would ask,
	with clear, simple answers explained as if to a 7-year-old — no jargon left
	unexplained.
	"""

	import sys
	from pathlib import Path

	sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

	from reportlab.lib.pagesizes import A4
	from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
	from reportlab.lib.units import cm
	from reportlab.lib import colors
	from reportlab.platypus import (
	SimpleDocTemplate, Paragraph, Spacer, PageBreak,
	Table, TableStyle, HRFlowable, KeepTogether
	)
	from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY

	# Destination of the generated PDF (a relative path).
	OUTPUT = Path("results/interview_prep.pdf")
	# Make sure the target directory exists before the document is written.
	# parents=True keeps this working if OUTPUT is ever pointed at a nested
	# directory whose ancestors have not been created yet.
	OUTPUT.parent.mkdir(parents=True, exist_ok=True)

	# ---------------------------------------------------------------------------
	# Styles
	# ---------------------------------------------------------------------------
	# Base stylesheet; every custom style below inherits from one of its entries.
	styles = getSampleStyleSheet()

	# Cover page: main title.
	TITLE_STYLE = ParagraphStyle(
	    "ITitle",
	    parent=styles["Title"],
	    fontSize=30,
	    textColor=colors.HexColor("#0f3460"),
	    spaceAfter=12,
	    alignment=TA_CENTER,
	    fontName="Helvetica-Bold",
	)

	# Cover page: line under the title.
	SUBTITLE_STYLE = ParagraphStyle(
	    "ISubtitle",
	    parent=styles["Normal"],
	    fontSize=13,
	    textColor=colors.HexColor("#533483"),
	    spaceAfter=8,
	    alignment=TA_CENTER,
	)

	# Cover page: centred introductory paragraphs.
	COVER_BODY = ParagraphStyle(
	    "ICoverBody",
	    parent=styles["Normal"],
	    fontSize=11,
	    leading=17,
	    textColor=colors.HexColor("#1a1a2e"),
	    alignment=TA_CENTER,
	    spaceAfter=8,
	)

	# Section banner (used by sec()): white text on a filled background.
	SECTION_STYLE = ParagraphStyle(
	    "ISection",
	    parent=styles["Heading1"],
	    fontSize=18,
	    textColor=colors.white,
	    spaceBefore=16,
	    spaceAfter=8,
	    backColor=colors.HexColor("#0f3460"),
	    borderPad=8,
	    fontName="Helvetica-Bold",
	)

	# Category sub-heading (used by cat()).
	CATEGORY_STYLE = ParagraphStyle(
	    "ICategory",
	    parent=styles["Heading2"],
	    fontSize=13,
	    textColor=colors.HexColor("#533483"),
	    spaceBefore=14,
	    spaceAfter=4,
	    fontName="Helvetica-Bold",
	)

	# Question box (used by q()): bold text in a bordered, tinted box.
	Q_STYLE = ParagraphStyle(
	    "IQuestion",
	    parent=styles["Normal"],
	    fontSize=11,
	    leading=16,
	    textColor=colors.HexColor("#0f3460"),
	    spaceBefore=10,
	    spaceAfter=3,
	    fontName="Helvetica-Bold",
	    backColor=colors.HexColor("#e8f4fd"),
	    borderColor=colors.HexColor("#0f3460"),
	    borderWidth=1,
	    borderPad=7,
	    borderRadius=4,
	    leftIndent=0,
	)

	# Technical answer text (used by a()): justified, slightly indented.
	A_STYLE = ParagraphStyle(
	    "IAnswer",
	    parent=styles["Normal"],
	    fontSize=10,
	    leading=16,
	    textColor=colors.HexColor("#1a1a1a"),
	    spaceBefore=4,
	    spaceAfter=4,
	    alignment=TA_JUSTIFY,
	    leftIndent=8,
	)

	# "Simple version" call-out (used by simple()).
	SIMPLE_STYLE = ParagraphStyle(
	    "ISimple",
	    parent=styles["Normal"],
	    fontSize=10,
	    leading=15,
	    textColor=colors.HexColor("#065f46"),
	    spaceBefore=4,
	    spaceAfter=6,
	    backColor=colors.HexColor("#ecfdf5"),
	    borderColor=colors.HexColor("#6ee7b7"),
	    borderWidth=1,
	    borderPad=6,
	    borderRadius=3,
	    leftIndent=8,
	)

	# "Interview Tip" call-out (used by tip()).
	TIP_STYLE = ParagraphStyle(
	    "ITip",
	    parent=styles["Normal"],
	    fontSize=9.5,
	    leading=14,
	    textColor=colors.HexColor("#92400e"),
	    spaceBefore=3,
	    spaceAfter=6,
	    backColor=colors.HexColor("#fffbeb"),
	    borderColor=colors.HexColor("#fcd34d"),
	    borderWidth=1,
	    borderPad=5,
	    leftIndent=8,
	)

	# Bullet lines (used by bul()).
	BULLET_STYLE = ParagraphStyle(
	    "IBullet",
	    parent=styles["Normal"],
	    fontSize=10,
	    leading=15,
	    textColor=colors.HexColor("#1a1a1a"),
	    leftIndent=20,
	    spaceAfter=3,
	    bulletIndent=10,
	)

	# Plain body paragraphs (used by body()).
	BODY_STYLE = ParagraphStyle(
	    "IBody",
	    parent=styles["Normal"],
	    fontSize=10,
	    leading=15,
	    textColor=colors.HexColor("#374151"),
	    spaceAfter=5,
	    alignment=TA_JUSTIFY,
	)

	# ---------------------------------------------------------------------------
	# Helpers
	# ---------------------------------------------------------------------------
	def sec(title):
	    """Return the flowables for a section banner: spacer, heading, spacer."""
	    banner = Paragraph(f" {title}", SECTION_STYLE)
	    return [Spacer(1, 10), banner, Spacer(1, 6)]

	def cat(title):
	    """Return a category sub-heading followed by a full-width rule.

	    ``title`` may be a string or a list; for a list only its first element is used.
	    """
	    heading_text = title[0] if isinstance(title, list) else title
	    underline = HRFlowable(
	        width="100%",
	        thickness=0.8,
	        color=colors.HexColor("#533483"),
	        spaceAfter=4,
	    )
	    return [Paragraph(heading_text, CATEGORY_STYLE), underline]

	def q(text):
	    """Return ``text`` as a question paragraph, prefixed with "Q: "."""
	    labelled = f"Q: {text}"
	    return Paragraph(labelled, Q_STYLE)

	def a(text):
	    """Return ``text`` as an answer paragraph (no prefix added)."""
	    answer = Paragraph(text, A_STYLE)
	    return answer

	def simple(text):
	    """Return ``text`` as a "Simple version" call-out paragraph."""
	    labelled = f" Simple version: {text}"
	    return Paragraph(labelled, SIMPLE_STYLE)

	def tip(text):
	    """Return ``text`` as an "Interview Tip" call-out paragraph."""
	    labelled = f" Interview Tip: {text}"
	    return Paragraph(labelled, TIP_STYLE)

	def bul(text):
	    """Return ``text`` as a bullet-style paragraph with a leading dash."""
	    dashed = f" - {text}"
	    return Paragraph(dashed, BULLET_STYLE)

	def body(text):
	    """Return ``text`` as a plain body paragraph."""
	    paragraph = Paragraph(text, BODY_STYLE)
	    return paragraph

	def sp(n=8):
	    """Return a ``Spacer(1, n)``; ``n`` defaults to 8."""
	    gap = Spacer(1, n)
	    return gap

	def rule():
	    """Return the thin full-width divider placed at the end of each Q&A block."""
	    return HRFlowable(
	        width="100%",
	        thickness=0.4,
	        color=colors.HexColor("#e5e7eb"),
	        spaceAfter=6,
	    )

	def qa_block(question, answer_text, simple_text="", tip_text="", bullets=None):
	    """Assemble the flowables for a single question/answer entry.

	    The result always holds the question, the answer, and a closing divider.
	    Bullets, the "simple version" call-out, and the interview tip are added
	    only when a truthy value is supplied for them. Returns a list.
	    """
	    block = [sp(4), q(question), sp(3), a(answer_text)]
	    # Optional bullet points sit directly beneath the answer.
	    block.extend(bul(point) for point in (bullets or ()))
	    # Each optional call-out is preceded by a small spacer.
	    for text, make_callout in ((simple_text, simple), (tip_text, tip)):
	        if text:
	            block += [sp(2), make_callout(text)]
	    # Close the entry with a spacer and a divider line.
	    block += [sp(4), rule()]
	    return block

	# ---------------------------------------------------------------------------
	# Build story
	# ---------------------------------------------------------------------------
	# ===== COVER PAGE =====
	# The story starts with the cover-page title, subtitle, and two intro paragraphs.
	story = [
	    sp(50),
	    Paragraph("Interview Prep Guide", TITLE_STYLE),
	    Paragraph("Customer Support AI — Intent Classifier Project", SUBTITLE_STYLE),
	    sp(16),
	    Paragraph(
	        "This guide prepares you to answer any question a recruiter or technical interviewer "
	        "might ask about your Customer Support AI project.",
	        COVER_BODY,
	    ),
	    sp(8),
	    Paragraph(
	        "Every answer is written twice: once in proper technical language, and once in "
	        "super-simple language — the way you would explain it to a 7-year-old. "
	        "Reading both will make the concept stick.",
	        COVER_BODY,
	    ),
	    sp(20),
	]

	# Summary box
	# One-row table of three coloured summary tiles for the cover page.
	# Paragraph treats a raw newline as ordinary whitespace, so the hard line
	# breaks inside each tile caption are written with <br/> markup.
	cover_table = Table(
	    [[
	        Paragraph(
	            caption,
	            ParagraphStyle(
	                style_name,
	                fontSize=13,
	                alignment=TA_CENTER,
	                textColor=colors.white,
	                fontName="Helvetica-Bold",
	                leading=18,
	            ),
	        )
	        for style_name, caption in (
	            ("ct", "30<br/>Questions<br/>Covered"),
	            ("ct2", "5<br/>Difficulty<br/>Levels"),
	            ("ct3", "Simple<br/>Explanation<br/>Every Time"),
	        )
	    ]],
	    # Three equal columns; widths must be numeric expressions (5 * cm).
	    colWidths=[5 * cm, 5 * cm, 5 * cm],
	)
	cover_table.setStyle(TableStyle([
	    # One background colour per tile.
	    ("BACKGROUND", (0, 0), (0, 0), colors.HexColor("#0f3460")),
	    ("BACKGROUND", (1, 0), (1, 0), colors.HexColor("#533483")),
	    ("BACKGROUND", (2, 0), (2, 0), colors.HexColor("#2d6a4f")),
	    ("ALIGN", (0, 0), (-1, -1), "CENTER"),
	    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
	    ("ROWBACKGROUNDS", (0, 0), (-1, -1), [None]),
	    # White outline and grid lines separate the tiles.
	    ("BOX", (0, 0), (-1, -1), 1, colors.white),
	    ("INNERGRID", (0, 0), (-1, -1), 1, colors.white),
	    ("TOPPADDING", (0, 0), (-1, -1), 14),
	    ("BOTTOMPADDING", (0, 0), (-1, -1), 14),
	]))
	story.append(cover_table)
	story.append(PageBreak())

	# ===== TABLE OF CONTENTS =====
	story += sec("Table of Contents")
	# NOTE(review): the page numbers below are hard-coded strings; nothing here
	# derives them from the rendered document, so they can drift as content changes.
	toc_data = [
	    ["Section", "Topic", "Page"],
	    ["1", "The Big Picture — What Did You Build?", "3"],
	    ["2", "The Data — Where Did It Come From?", "5"],
	    ["3", "The Models — How Did You Train Them?", "7"],
	    ["4", "The Pipeline — How Does It All Connect?", "11"],
	    ["5", "Evaluation — How Do You Know It Works?", "13"],
	    ["6", "Challenges & Problem Solving", "16"],
	    ["7", "Production & Real-World Thinking", "18"],
	    ["8", "Behavioural Questions", "21"],
	    ["9", "Rapid-Fire Questions (Short Answers)", "23"],
	    ["10", "Questions YOU Should Ask the Interviewer", "25"],
	]
	# Column widths must be numeric expressions (value * cm).
	toc = Table(toc_data, colWidths=[1.5 * cm, 12 * cm, 2.5 * cm])
	toc.setStyle(TableStyle([
	    # Header row: white bold text on a filled background.
	    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")),
	    ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
	    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
	    ("FONTSIZE", (0, 0), (-1, 0), 10),
	    # Data rows: alternating backgrounds.
	    ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f8f9fa"), colors.white]),
	    ("FONTSIZE", (0, 1), (-1, -1), 10),
	    # Centre the "Section" and "Page" columns.
	    ("ALIGN", (0, 0), (0, -1), "CENTER"),
	    ("ALIGN", (2, 0), (2, -1), "CENTER"),
	    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
	    ("GRID", (0, 0), (-1, -1), 0.5, colors.HexColor("#dee2e6")),
	    ("TOPPADDING", (0, 0), (-1, -1), 7),
	    ("BOTTOMPADDING", (0, 0), (-1, -1), 7),
	    ("LEFTPADDING", (0, 0), (-1, -1), 8),
	]))
	story += [toc, PageBreak()]

	# ===========================================================================
	# SECTION 1 — THE BIG PICTURE
	# ===========================================================================
	# Section 1 content: banner, one category heading, four Q&A entries, page break.
	story.extend(sec("Section 1: The Big Picture — What Did You Build?"))

	story.extend(cat(["Overview Questions"]))

	story.extend(qa_block(
	    question="Can you give me a 60-second summary of this project?",
	    answer_text=(
	        "I built a two-stage automated customer support system. In stage one, a fine-tuned "
	        "DistilBERT model reads an incoming customer message and classifies it into one of six "
	        "intent categories — things like billing issues, account access problems, or cancellation "
	        "requests. In stage two, the predicted intent is passed as context to Claude (an Anthropic "
	        "LLM), which then generates a helpful, human-sounding support response tailored to that "
	        "specific intent. The system also flags low-confidence predictions for human review. I "
	        "evaluated the full pipeline using a custom LLM-based scoring framework for faithfulness "
	        "and answer relevancy, achieving 0.837 answer relevancy on 50 test queries."
	    ),
	    simple_text=(
	        "Imagine a robot postbox at a company. When a customer sends a message, the robot reads "
	        "it and puts it in one of six boxes — like 'money problems' or 'can't log in'. Then a "
	        "second, smarter robot writes a kind reply based on which box it went into. I built both "
	        "robots and tested how well they work."
	    ),
	    tip_text="Always open with: what it does, how it works, and one key result. This answer does all three.",
	))

	story.extend(qa_block(
	    question="Why did you choose this project?",
	    answer_text=(
	        "Customer support automation is a genuine industry problem — companies spend billions "
	        "on support operations and response quality is inconsistent. This project let me practice "
	        "the full ML lifecycle in one place: data engineering, fine-tuning a transformer model, "
	        "prompt engineering with a production LLM, evaluation framework design, and packaging "
	        "everything into a reproducible pipeline. It also demonstrates that I understand both "
	        "classical NLP (TF-IDF baseline) and modern deep learning approaches."
	    ),
	    simple_text=(
	        "Customer support is expensive and slow. I wanted to build something that actually "
	        "saves a company time and money. And it let me practice every important skill in one "
	        "single project — like training for a sports competition by doing every exercise at once."
	    ),
	    tip_text="Show that you understood the business problem, not just the tech. Recruiters love this.",
	))

	story.extend(qa_block(
	    question="What are the two stages of the pipeline?",
	    answer_text=(
	        "Stage 1 is the Intent Classifier: a DistilBERT transformer model fine-tuned on labelled "
	        "customer support examples. It reads the raw customer query and outputs a predicted intent "
	        "label plus a confidence score. Stage 2 is the Response Generator: an Anthropic Claude "
	        "model that receives the original query plus a structured prompt template filled with "
	        "intent-specific guidance, and produces a personalised support response. The two stages "
	        "are chained in the SupportAgent class."
	    ),
	    simple_text=(
	        "Stage 1 is the SORTING robot — it reads the message and decides what kind of problem "
	        "it is. Stage 2 is the WRITING robot — it reads the sorted message and writes a nice "
	        "reply. They work together like a post office and a letter writer."
	    ),
	    tip_text="Draw this on a whiteboard if you get the chance. Diagrams make answers memorable.",
	))

	story.extend(qa_block(
	    question="What are the 6 intent categories and how did you choose them?",
	    answer_text=(
	        "The six categories are: billing_issue (charges, refunds, payment problems), "
	        "account_access (login, password, account management), technical_support (product "
	        "or service problems, delivery), product_inquiry (information, compatibility, "
	        "warranty), cancellation_request (cancelling orders or subscriptions), and "
	        "general_feedback (complaints, suggestions, general questions). I derived these "
	        "by analysing the Bitext customer support dataset's 50+ granular intent tags and "
	        "grouping them into business-meaningful categories that a real support department "
	        "would use to route tickets."
	    ),
	    simple_text=(
	        "Think of it like sorting your toys into boxes: money box, login box, broken-thing box, "
	        "asking-questions box, I-want-to-quit box, and other box. These six boxes cover almost "
	        "everything a customer could ever message about."
	    ),
	    tip_text="Mention that the categories were business-driven, not just technically convenient. This shows maturity.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 2 — THE DATA
	# ===========================================================================
	# Section 2 content: banner, one category heading, four Q&A entries, page break.
	story.extend(sec("Section 2: The Data — Where Did It Come From?"))

	story.extend(cat(["Dataset Questions"]))

	story.extend(qa_block(
	    question="What dataset did you use and why?",
	    answer_text=(
	        "I used the Bitext Customer Support LLM Chatbot Training Dataset from HuggingFace, "
	        "which contains 26,872 labelled customer support utterances across 50+ fine-grained "
	        "intent categories. I chose it because it is publicly available, professionally "
	        "labelled, representative of real support language, and large enough to fine-tune "
	        "a transformer model reliably. It also covers a wide vocabulary of customer phrasings "
	        "for the same intent, which helps the model generalise."
	    ),
	    simple_text=(
	        "I found a big collection of 26,872 real customer messages on the internet. Each "
	        "message already had a label saying what the customer wanted. It's like having a "
	        "giant homework sheet that already has all the answers marked — perfect for teaching "
	        "the robot."
	    ),
	    tip_text="Always know your dataset size, source, and why it was appropriate. These are standard first questions.",
	))

	story.extend(qa_block(
	    question="How did you preprocess the data?",
	    answer_text=(
	        "Preprocessing involved three steps: (1) Text cleaning — converting text to lowercase, "
	        "stripping non-ASCII characters, and normalising whitespace using regex. This reduces "
	        "vocabulary noise without removing meaningful content. (2) Label mapping — the Bitext "
	        "dataset has 50+ granular tags which I mapped to my 6 business categories using a "
	        "keyword-based dictionary (LABEL_MAP). Labels that didn't match a keyword got assigned "
	        "via a fallback heuristic. (3) Stratified splitting — I split the data 70/15/15 into "
	        "train/validation/test sets using sklearn's train_test_split with stratify=label, "
	        "ensuring all 6 classes are proportionally represented in every split."
	    ),
	    simple_text=(
	        "I cleaned the messages (made everything lowercase, removed weird characters), "
	        "then sorted the 50+ original label types into my 6 big categories, "
	        "and finally split the data into three piles: a teaching pile, a practice pile, "
	        "and a final exam pile."
	    ),
	    tip_text="Stratified splitting is an important detail that shows you understand class imbalance. Mention it confidently.",
	))

	story.extend(qa_block(
	    question="What is stratified splitting and why does it matter?",
	    answer_text=(
	        "Stratified splitting means that when you divide your data into train, validation, "
	        "and test sets, you ensure each set contains the same proportion of each class label "
	        "as the original dataset. Without this, you might accidentally put all examples of a "
	        "rare class into the training set and have none in the test set, making evaluation "
	        "meaningless. sklearn's train_test_split with stratify=y handles this automatically."
	    ),
	    simple_text=(
	        "Imagine you have 10 red balls and 90 blue balls. Stratified splitting means that "
	        "no matter which pile you make, each pile has roughly 10% red and 90% blue. "
	        "If you did it randomly, you might get a pile that's 100% blue and never test "
	        "if the robot can recognise red ones."
	    ),
	    tip_text="This is a classic interview topic. Knowing why it matters (not just what it is) impresses interviewers.",
	))

	story.extend(qa_block(
	    question="You mapped 50+ labels to 6. How did you handle ambiguous labels?",
	    answer_text=(
	        "I built a LABEL_MAP dictionary that maps each of the Bitext tags to one of my 6 "
	        "categories using exact string matching. For any tag that wasn't explicitly in the "
	        "dictionary, I applied a keyword fallback: if the tag string contained words like "
	        "'bill', 'charge', or 'payment', it was assigned to billing_issue, and so on for each "
	        "category. This covered the vast majority of cases. About 973 rows used the fallback. "
	        "In a production system, I would review these fallback assignments manually to ensure "
	        "accuracy."
	    ),
	    simple_text=(
	        "I made a lookup table — like a translation dictionary. If a label was in the "
	        "dictionary, I used that translation. If not, I tried to guess from the words in "
	        "the label name. Like if a label said 'billing_adjustment', I could guess it belongs "
	        "in the money/billing box because it contains the word 'billing'."
	    ),
	    tip_text="Acknowledging the 973 fallback rows and saying you'd manually review them shows intellectual honesty.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 3 — THE MODELS
	# ===========================================================================
	# Section 3 content: banner, two category groups (baseline, then DistilBERT
	# and fine-tuning), eight Q&A entries in total, page break.
	story.extend(sec("Section 3: The Models — How Did You Train Them?"))

	story.extend(cat(["Baseline Model Questions"]))

	story.extend(qa_block(
	    question="You built two classifiers. What is the baseline and why did you build it?",
	    answer_text=(
	        "The baseline is a TF-IDF vectoriser combined with a Logistic Regression classifier, "
	        "implemented as a single sklearn Pipeline. TF-IDF converts each message into a vector "
	        "of numbers representing word importance scores. Logistic Regression then finds the "
	        "linear decision boundary that separates the 6 classes. I built it first because: "
	        "(1) it trains in milliseconds, (2) it provides a performance floor to compare against, "
	        "and (3) it demonstrates that I understand when simpler models are appropriate."
	    ),
	    simple_text=(
	        "Before building the fancy robot, I built a simple one. The simple one counts which "
	        "words appear in a message and uses that to guess the category. It's like a calculator "
	        "vs a smartphone. I built the calculator first to prove the smartphone was actually "
	        "worth building."
	    ),
	    tip_text="Always justify your baseline. Interviewers want to see that you built it deliberately, not as an afterthought.",
	))

	story.extend(qa_block(
	    question="What is TF-IDF?",
	    answer_text=(
	        "TF-IDF stands for Term Frequency — Inverse Document Frequency. TF measures how often "
	        "a word appears in one document (high TF = word is frequent in this doc). IDF measures "
	        "how rare the word is across all documents (high IDF = word is unique to few docs). "
	        "Multiplying them gives a score that is high for words that are common in one document "
	        "but rare across the whole dataset — these are the most informative words. Common words "
	        "like 'the' or 'is' get near-zero scores because they appear everywhere."
	    ),
	    simple_text=(
	        "Imagine every word gets a score. A word that appears a lot in just ONE message "
	        "gets a high score — it's special to that message. A word like 'the' that appears "
	        "in every single message gets a low score — it tells us nothing. TF-IDF is just a "
	        "formula for giving each word its specialness score."
	    ),
	    tip_text="TF-IDF is a very common interview question. Learn this definition by heart.",
	))

	story.extend(qa_block(
	    question="Your baseline (0.9958 F1) outperformed DistilBERT (0.9825 F1). How do you explain that?",
	    answer_text=(
	        "There are two reasons. First, the dataset itself: the Bitext dataset is professionally "
	        "labelled and uses very consistent, formal language for each intent. TF-IDF word counts "
	        "are perfectly sufficient to separate these clean categories — specific keywords almost "
	        "uniquely identify each class. Second, the training constraint: I was running on CPU "
	        "only, so I subsampled to 3,000 training examples and capped training at 300 steps. "
	        "DistilBERT trained on the full dataset with more epochs would likely match or exceed "
	        "the baseline. The baseline advantage is a dataset characteristic, not evidence that "
	        "DistilBERT is a worse model."
	    ),
	    simple_text=(
	        "The fancy robot did slightly worse because I couldn't let it study for long enough — "
	        "it only had 300 practice rounds instead of thousands. The simple robot was good enough "
	        "for this particular test because the messages in the dataset use very predictable words. "
	        "If we had messier, real-world messages, the fancy robot would win."
	    ),
	    tip_text=(
	        "This is almost guaranteed to come up. Interviewers love testing whether you understand "
	        "your own results. The two-part answer (dataset quality + training constraint) is impressive."
	    ),
	))

	story.extend(cat(["DistilBERT & Fine-Tuning Questions"]))

	story.extend(qa_block(
	    question="What is DistilBERT and why did you choose it?",
	    answer_text=(
	        "DistilBERT is a smaller, faster version of BERT (Bidirectional Encoder Representations "
	        "from Transformers) created by HuggingFace using a technique called knowledge distillation. "
	        "It retains 97% of BERT's language understanding while being 40% smaller and 60% faster. "
	        "I chose it over full BERT because: (1) I was training on CPU, so speed and memory matter, "
	        "(2) 97% performance retention is sufficient for a classification task, and (3) it is "
	        "a production-proven model with excellent HuggingFace support."
	    ),
	    simple_text=(
	        "BERT is a very smart robot brain that has read millions of books and websites. "
	        "DistilBERT is BERT's younger sibling — 40% smaller, almost just as smart. I picked "
	        "the little sibling because it runs faster on my computer, and for sorting six categories "
	        "of messages, the little sibling is smart enough."
	    ),
	    tip_text="Justify model choice with concrete numbers (40% smaller, 97% performance, 60% faster). Don't just say 'it's popular'.",
	))

	story.extend(qa_block(
	    question="What is fine-tuning and what does it mean to fine-tune DistilBERT?",
	    answer_text=(
	        "Fine-tuning means taking a pre-trained model — one that has already learned general "
	        "language understanding from a massive text corpus — and continuing to train it on a "
	        "smaller, task-specific dataset. The pre-trained model already knows grammar, context, "
	        "and word meanings. Fine-tuning teaches it the specifics of your task. For DistilBERT, "
	        "this means: (1) loading the pre-trained weights, (2) adding a classification head "
	        "(a new linear layer that outputs 6 class probabilities), and (3) training the entire "
	        "model end-to-end on the labelled customer support data."
	    ),
	    simple_text=(
	        "Imagine you hire someone who already speaks fluent English and has read every book "
	        "ever written. Fine-tuning is like giving that person a one-week crash course on "
	        "customer support specifically. They already know words and sentences — you just "
	        "teach them your specific job. Much faster than training someone from scratch."
	    ),
	    tip_text="Use the 'pre-trained + task-specific' framing. It's the standard mental model for fine-tuning.",
	))

	story.extend(qa_block(
	    question="What is a classification head?",
	    answer_text=(
	        "A classification head is a simple linear layer added on top of a pre-trained model. "
	        "DistilBERT's core outputs a 768-dimensional vector (called the [CLS] token embedding) "
	        "that represents the meaning of the entire input sentence. The classification head "
	        "multiplies this 768-dimensional vector by a weight matrix to produce 6 output "
	        "scores (one per class), then applies softmax to convert them into probabilities. "
	        "During fine-tuning, both the DistilBERT weights and the classification head weights "
	        "are updated."
	    ),
	    simple_text=(
	        "DistilBERT reads a sentence and produces a big list of 768 numbers that summarises "
	        "the meaning. The classification head is like a voting machine — it takes those 768 "
	        "numbers, does some maths, and outputs 6 scores: 'billing: 80%, login: 5%, ...' "
	        "The highest score wins and becomes the prediction."
	    ),
	    tip_text="Knowing the dimension (768) and that softmax converts logits to probabilities is a strong technical detail.",
	))

	story.extend(qa_block(
	    question="What hyperparameters did you tune and why?",
	    answer_text=(
	        "Key hyperparameters: learning_rate=2e-5 (standard for BERT fine-tuning; too high "
	        "causes catastrophic forgetting, too low means no learning), max_length=128 tokens "
	        "(sufficient for short support queries, reduces memory), batch_size=16 (balance "
	        "between gradient quality and memory on CPU), max_steps=300 (CPU-adaptive cap to "
	        "complete training in reasonable time), warmup_steps=int(0.1 * max_steps) (prevents "
	        "large gradient updates in early training when weights are random). These are "
	        "standard recommendations from the original BERT paper, adapted for CPU constraints."
	    ),
	    simple_text=(
	        "Hyperparameters are like the settings on an oven before you bake a cake. "
	        "Learning rate is how fast the robot adjusts — too fast and it forgets everything, "
	        "too slow and it never learns. Batch size is how many examples it looks at "
	        "before updating. Warmup steps is a gentle warm-up period, like stretching "
	        "before exercise."
	    ),
	    tip_text=(
	        "Always be able to explain WHY you set each hyperparameter, not just what you set it to. "
	        "'2e-5 is standard for BERT fine-tuning per the original paper' is a strong answer."
	    ),
	))

	story.extend(qa_block(
	    question="How did you handle training on CPU only?",
	    answer_text=(
	        "I implemented automatic hardware detection at the start of training using "
	        "torch.cuda.is_available(). When no GPU is detected, the training script activates "
	        "two adaptive strategies: (1) Data subsampling — it stratified-samples 3,000 examples "
	        "from the full training set rather than training on all 18,000, ensuring all 6 classes "
	        "remain represented; (2) Step capping — it sets max_steps=300 instead of training for "
	        "multiple full epochs. This reduces training time from ~20 hours to ~20 minutes while "
	        "still producing a functional model."
	    ),
	    simple_text=(
	        "Training a big neural network without a GPU is like running a marathon on crutches — "
	        "very slow. So I wrote code that detects 'no GPU found' and automatically switches "
	        "to a faster, smaller version of the training: fewer examples, fewer steps. "
	        "The robot doesn't learn as much, but it learns enough, and it finishes in 20 minutes "
	        "instead of 20 hours."
	    ),
	    tip_text="This shows engineering pragmatism — you adapted to constraints rather than just failing. Interviewers love this.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 4 — THE PIPELINE
	# ===========================================================================
	# Section 4 content: banner, one category heading, three Q&A entries, page break.
	story.extend(sec("Section 4: The Pipeline — How Does It All Connect?"))

	story.extend(cat(["Architecture Questions"]))

	story.extend(qa_block(
	    question="Walk me through what happens when a customer sends a message.",
	    answer_text=(
	        "1. The raw customer query arrives at SupportAgent.resolve(). "
	        "2. IntentClassifier.predict() tokenises the text, runs it through DistilBERT, "
	        "and returns the top predicted intent label plus a confidence score (softmax probability). "
	        "3. If confidence is below 0.70, the agent sets requires_human=True and returns a "
	        "flag for human review without calling the LLM. "
	        "4. Otherwise, get_template() fetches the intent-specific prompt template. "
	        "format_user_prompt() fills in the customer query. "
	        "5. ResponseGenerator.generate() sends the system prompt and user prompt to "
	        "Claude via the Anthropic API and receives the generated response. "
	        "6. The agent returns a dict containing the query, intent, confidence, response, "
	        "context, and human_review flag."
	    ),
	    simple_text=(
	        "Step 1: Customer writes a message. Step 2: Robot 1 reads it and decides which "
	        "of 6 boxes it belongs to (and how sure it is). Step 3: If the robot is not sure "
	        "enough (less than 70% confident), it raises a flag and a real human will handle it. "
	        "Step 4: If the robot is sure, it picks the right letter template for that topic. "
	        "Step 5: Robot 2 (Claude) reads the template and writes a personalised reply. "
	        "Step 6: The full reply plus all the details are returned."
	    ),
	    tip_text="Practice saying this as a numbered list out loud. Being able to narrate a system end-to-end is a strong interview skill.",
	))

	story.extend(qa_block(
	    question="What is prompt engineering and how did you use it?",
	    answer_text=(
	        "Prompt engineering is the practice of crafting input text to an LLM to guide it "
	        "toward producing a desired output. In this project, I designed 6 intent-specific "
	        "prompt templates, each with a system prompt (setting the LLM's role and tone) and "
	        "a user prompt (providing the customer query plus intent-specific guidance). "
	        "For example, the billing_issue template instructs the model to acknowledge the "
	        "financial concern, show empathy, and offer concrete next steps. This structured "
	        "approach ensures consistent, on-brand responses without requiring the LLM to guess "
	        "the appropriate tone and content."
	    ),
	    simple_text=(
	        "Prompt engineering is writing good instructions for the robot. Instead of just "
	        "saying 'write a reply', I say 'you are a friendly support agent, the customer has "
	        "a billing problem, be empathetic, offer to help fix it'. The better your instructions, "
	        "the better the robot's answer."
	    ),
	    tip_text="Mention that you have 6 separate templates, not one generic one. This shows attention to detail.",
	))

	story.extend(qa_block(
	    question="Why does the system flag low-confidence predictions for human review?",
	    answer_text=(
	        "The confidence threshold (0.70) acts as a safety net. When the classifier's softmax "
	        "probability for the top class is below 70%, it indicates the model is uncertain — "
	        "the input may be ambiguous, out-of-distribution, or phrased in a way the model "
	        "hasn't seen. Sending an uncertain intent to the LLM would generate a response built "
	        "on a potentially wrong context, which could mislead or frustrate the customer. "
	        "Flagging for human review prevents poor automated responses from reaching customers "
	        "while still automating the confident majority."
	    ),
	    simple_text=(
	        "Imagine asking the sorting robot 'are you sure?' — if it's less than 70% sure, "
	        "it says 'I'm not confident, a human should handle this one'. This is important "
	        "because if the robot sorts the message into the wrong box, the reply will be "
	        "totally wrong. Better to get a human than to send a bad automated reply."
	    ),
	    tip_text="This shows you designed for real-world use, not just accuracy metrics. Production-readiness thinking.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 5 — EVALUATION
	# ===========================================================================
	# Section 5 content: banner, one category heading, and its first two Q&A entries.
	story.extend(sec("Section 5: Evaluation — How Do You Know It Works?"))

	story.extend(cat(["Metrics & Evaluation Questions"]))

	story.extend(qa_block(
	    question="What is weighted F1 score and why did you use it?",
	    answer_text=(
	        "F1 score is the harmonic mean of precision and recall. Precision asks: of all the "
	        "messages I labelled as 'billing_issue', how many actually were? Recall asks: of all "
	        "the actual billing_issue messages, how many did I catch? The harmonic mean penalises "
	        "imbalanced precision/recall more than the arithmetic mean. Weighted F1 averages the "
	        "per-class F1 scores, weighting each class by its number of examples. I chose weighted "
	        "F1 over accuracy because it better handles class imbalance — accuracy alone can be "
	        "misleadingly high if one class dominates."
	    ),
	    simple_text=(
	        "Imagine a test where 90% of questions are easy and 10% are hard. If you only answer "
	        "the easy ones, you score 90% but you're failing on the hard ones. F1 score checks "
	        "BOTH whether your answers are correct AND whether you answered all the questions — "
	        "not just the easy majority."
	    ),
	    tip_text="Knowing why you chose F1 over accuracy is a very common interview question. Always have this answer ready.",
	))

	story.extend(qa_block(
	    question="What is RAGAS and how did you use it?",
	    answer_text=(
	        "RAGAS (Retrieval-Augmented Generation Assessment) is an open-source evaluation "
	        "framework originally designed to measure the quality of RAG pipeline outputs. "
	        "It provides metrics including Faithfulness (does the response stay within the "
	        "provided context?) and Answer Relevancy (does the response address the question?). "
	        "I initially attempted to use the RAGAS library but encountered dependency conflicts "
	        "— it required OpenAI embeddings by default. I ultimately implemented the same metrics "
	        "directly using Claude Haiku as the evaluator LLM, bypassing the library while "
	        "preserving the conceptual framework."
	    ),
	    simple_text=(
	        "RAGAS is a tool for grading AI replies. Faithfulness asks: did the robot stick to "
	        "what it was told, or did it make things up? Answer Relevancy asks: did the robot "
	        "actually answer the question? I tried using the RAGAS tool but it had technical "
	        "problems, so I built my own version that does the same grading."
	    ),
	    tip_text="Be upfront about the dependency issue and your workaround. Showing problem-solving is better than hiding struggles.",
	))

	story += qa_block(
	question="Your faithfulness score was 0.667, below the 0.85 target. Is that a failure?",
	answer_text=(
	"Not in this context. Faithfulness in RAGAS measures whether the generated response "
	"is grounded in the provided context document. In a RAG system with a knowledge base, "
	"a low faithfulness score means the model hallucinated facts. But in this system, "
	"the 'context' is a prompt template with minimal content — it contains guidance and "
	"tone instructions, not a database of facts. Claude is expected to generate helpful "
	"domain knowledge (like explaining billing processes) that is not literally in the "
	"template. This is correct, desirable behaviour. The more meaningful metric here is "
	"Answer Relevancy (0.837), which passed its target of 0.80."
	),
	simple_text=(
	"Faithfulness is like asking 'did the robot only use words from the instruction card?' "
	"But our instruction card only has general guidelines, not specific facts. So when "
	"the robot adds helpful details (like how to reset a password), it 'fails' faithfulness "
	"even though its answer was actually great. The more important score — did it answer "
	"the right question? — passed with 0.837."
	),
	tip_text=(
	"This is the most nuanced result in the project. Interviewers who see the 0.667 will "
	"test you on it. Have this explanation ready and be confident — you are NOT making excuses, "
	"you are correctly identifying a metric limitation."
	)
	)

	story += qa_block(
	question="How did you evaluate the LLM-generated responses?",
	answer_text=(
	"I implemented a custom synchronous evaluator using Claude Haiku as the judge LLM. "
	"For each of the 50 test responses, I sent two evaluation prompts to Claude Haiku: "
	"one asking it to score faithfulness (0.0-1.0) and one asking it to score answer "
	"relevancy (0.0-1.0). Each prompt asked for only a single decimal number in the reply "
	"(max_tokens=10, temperature=0 for determinism). I then computed mean, median, std, "
	"min, and max across all 50 scores. Results were saved to results/ragas_scores.json."
	),
	simple_text=(
	"I used a second AI (Claude Haiku) to grade the first AI's answers. For each answer, "
	"I asked Haiku two questions: 'How well does this answer stick to the topic? Score "
	"0 to 1' and 'How well does this answer address what the customer asked? Score 0 to 1'. "
	"Then I averaged all 50 scores to get the final grade."
	),
	tip_text="LLM-as-judge evaluation is a hot topic in 2024-2026. Knowing why you use temperature=0 for evaluation (reproducibility) is a great detail."
	)

	story += qa_block(
	question="What is the difference between precision and recall?",
	answer_text=(
	"Precision: of everything the model labelled as class X, what fraction actually is X? "
	"High precision = few false positives. Recall: of everything that actually is class X, "
	"what fraction did the model correctly identify? High recall = few false negatives. "
	"There is usually a trade-off: tuning for higher recall means accepting more false "
	"positives, and vice versa. The right balance depends on the cost of each error type. "
	"In a medical diagnosis context, high recall (catch all real cases) matters more. "
	"In a spam filter, high precision (don't block real emails) matters more."
	),
	simple_text=(
	"Precision: if the robot says 'this is a cat', how often is it actually a cat? "
	"Recall: of all the real cats, how many did the robot notice? "
	"A robot that calls everything a cat has perfect recall (it never misses a cat) "
	"but terrible precision (most of what it calls cats are dogs). "
	"You need both to be good."
	),
	tip_text="The medical/spam example is a classic way to make precision/recall trade-offs concrete. Use it."
	)

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 6 — CHALLENGES
	# ===========================================================================
	# Section banner and category label, followed by the first two
	# "tell me about a challenge" answers.
	story.extend(sec("Section 6: Challenges & Problem Solving"))

	story.extend(cat(["'Tell Me About a Challenge' Questions"]))

	story.extend(qa_block(
	    question="What was the hardest technical problem you faced and how did you solve it?",
	    answer_text=(
	        "The most significant challenge was the RAGAS evaluation framework's hard dependency "
	        "on OpenAI. After installing RAGAS and configuring the Anthropic LLM wrapper, the "
	        "library still tried to call OpenAI for embedding-based metrics. Attempts to swap "
	        "in HuggingFace embeddings via LangchainEmbeddingsWrapper also failed due to RAGAS's "
	        "internal async timeout handling. Rather than spending hours debugging a third-party "
	        "library, I made the decision to implement the same conceptual metrics — faithfulness "
	        "and answer relevancy — as a direct, synchronous Anthropic API loop. This removed "
	        "the dependency entirely, eliminated the async timeout issue, and produced cleaner, "
	        "more interpretable results."
	    ),
	    simple_text=(
	        "I tried to use a ready-made grading tool (RAGAS) but it secretly required a "
	        "different AI service (OpenAI) that I wasn't using. No matter what I tried, "
	        "it kept asking for that service. So instead of fighting it, I built my own "
	        "grading tool from scratch in 100 lines of code. My version was actually simpler "
	        "and worked better."
	    ),
	    tip_text="This answer shows debugging skill, good judgment (knowing when to stop debugging), and resourcefulness. Lead with the challenge, end with the solution.",
	))

	story.extend(qa_block(
	    question="How did you deal with the slow CPU training problem?",
	    answer_text=(
	        "The naive training run would have taken 20+ hours on CPU — clearly impractical. "
	        "I solved it with two changes: (1) Automatic detection — the code checks "
	        "torch.cuda.is_available() and activates 'CPU mode' when no GPU is found. "
	        "(2) Adaptive parameters — in CPU mode, training data is stratified-subsampled "
	        "to 3,000 examples and max_steps is capped at 300. This reduces training time to "
	        "~20 minutes while still producing a model with 0.9825 F1, which proves the approach "
	        "is sound. The config file exposes cpu_train_sample and cpu_max_steps as tunable "
	        "parameters so they can be adjusted."
	    ),
	    simple_text=(
	        "Training the robot normally would take 20 hours without a special graphics card. "
	        "I wrote code that detects the slow computer and automatically switches to a "
	        "faster mini-training mode: less data, fewer rounds. The robot doesn't become "
	        "as expert, but it still gets a 98.25% score, which proves the idea works. "
	        "It's like practicing for a marathon by running 5km — you prove you can run, "
	        "even if you haven't run the full 42km yet."
	    ),
	    tip_text="Framing this as intentional engineering (not a workaround) is important. You made a pragmatic trade-off, not a mistake.",
	))

	# Q&A: handling the scikit-learn `multi_class` removal.
	# NOTE: once `multi_class` is gone, LogisticRegression always fits a multinomial
	# (softmax) model for 3+ classes; one-vs-rest now requires wrapping the estimator
	# in OneVsRestClassifier. The answer text therefore says "multinomial", which is
	# also why dropping multi_class='multinomial' leaves the results unchanged.
	story += qa_block(
	    question="sklearn 1.8 removed the multi_class parameter. How did you handle a breaking change?",
	    answer_text=(
	        "When I ran the baseline training script, it threw a TypeError: "
	        "LogisticRegression.__init__() got an unexpected keyword argument 'multi_class'. "
	        "This is because sklearn 1.8 removed the deprecated multi_class='multinomial' "
	        "parameter. The fix was simple — remove the parameter from both the code and config. "
	        "Modern sklearn's LogisticRegression automatically handles multiclass problems using "
	        "the multinomial (softmax) formulation by default, which produces equivalent results. This was a "
	        "lesson in keeping requirements pinned in production to prevent unexpected breakage."
	    ),
	    simple_text=(
	        "A tool I was using (sklearn) got an update that removed a setting I was using. "
	        "The computer gave me an error saying it didn't recognise that setting anymore. "
	        "I looked it up and found out the new version doesn't need that setting — it "
	        "figures it out automatically. So I deleted that line of code and everything worked. "
	        "Lesson learned: always write down exactly which version of each tool you're using."
	    ),
	    tip_text="Handling a library breaking change gracefully and learning from it is a great story for a behavioural question.",
	)

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 7 — PRODUCTION THINKING
	# ===========================================================================
	# Deployment, monitoring, improvement and cost questions, each added through
	# qa_block(...) in the order they should appear in the PDF.
	story.extend(sec("Section 7: Production & Real-World Thinking"))

	story.extend(cat(["Scalability & Production Questions"]))

	story.extend(qa_block(
	    question="How would you deploy this system in production?",
	    answer_text=(
	        "A production deployment would involve: (1) Serving the classifier as a REST API "
	        "using FastAPI, with the model loaded into memory at startup and a /predict endpoint. "
	        "(2) Containerising with Docker so the model and all dependencies are portable. "
	        "(3) Deploying to a cloud provider (AWS, GCP, or Azure) with auto-scaling based on "
	        "request volume. (4) Implementing a message queue (e.g. SQS or Kafka) if volume is "
	        "high, so requests are processed asynchronously. (5) Caching the LLM response for "
	        "duplicate or near-duplicate queries to reduce Anthropic API costs. "
	        "(6) Adding monitoring/logging (latency, error rate, intent distribution) with tools "
	        "like Prometheus/Grafana or Datadog."
	    ),
	    simple_text=(
	        "To put this in a real company, I would: wrap it in a web address so other apps "
	        "can call it, package it in a box (Docker) so it runs anywhere, put it on a cloud "
	        "computer that can grow bigger when more people use it, save common replies so we "
	        "don't call the expensive AI every time, and add a dashboard showing how well it's "
	        "working every day."
	    ),
	    tip_text="Even if you haven't deployed it, showing you KNOW how to deploy it is enough. Mention FastAPI, Docker, and monitoring.",
	))

	story.extend(qa_block(
	    question="How would you monitor this system once deployed?",
	    answer_text=(
	        "Monitoring would cover three layers: (1) Infrastructure metrics — latency, error rate, "
	        "throughput (standard APM). (2) ML metrics — intent distribution drift (if billing_issue "
	        "suddenly spikes, something changed), average confidence score over time (confidence drop "
	        "may indicate the model is seeing new types of queries it wasn't trained on), and "
	        "human_review escalation rate. (3) Business metrics — customer satisfaction, resolution "
	        "time, re-contact rate. I would also implement periodic re-evaluation: run new queries "
	        "through the LLM judge and alert if relevancy drops below threshold."
	    ),
	    simple_text=(
	        "Monitoring is like a health check for the robot. I'd watch: is it fast enough? "
	        "Is it confident? Are more messages than usual going to humans for review? "
	        "Are customers satisfied with the replies? If any of these go wrong, "
	        "it might mean the robot needs to be retrained or fixed."
	    ),
	    tip_text="Mentioning concept drift (confidence drops, distribution shifts) shows senior ML engineering knowledge.",
	))

	story.extend(qa_block(
	    question="How would you improve the model if given more resources?",
	    answer_text=(
	        "With a GPU: train on the full 18,000+ example dataset for 3-5 epochs with proper "
	        "hyperparameter search (learning rate, batch size). "
	        "With more data: collect real customer support tickets, which are messier than the "
	        "Bitext dataset and would better reflect production distribution. "
	        "Architecturally: (1) implement retrieval-augmented generation — instead of static "
	        "prompt templates, retrieve relevant FAQ articles or resolution histories; "
	        "(2) add a re-ranking step to select the best candidate response from multiple "
	        "LLM generations; (3) implement active learning — flag uncertain predictions, "
	        "have humans label them, and retrain periodically."
	    ),
	    simple_text=(
	        "With a proper gaming computer: train the robot on all the data, not just a sample. "
	        "With real company data: teach the robot using actual past customer conversations. "
	        "With more time: instead of using a fixed template, let the robot look up real "
	        "answers from the company's help pages. Like teaching someone to use a real "
	        "reference book instead of memorising everything."
	    ),
	    tip_text="RAG as a next step is a strong answer because it shows architectural thinking beyond fine-tuning.",
	))

	story.extend(qa_block(
	    question="What is the cost of running this system at scale?",
	    answer_text=(
	        "The main cost is the Anthropic API for response generation. At the time of building "
	        "this, Claude Sonnet costs approximately $3 per million input tokens and $15 per million "
	        "output tokens. A typical support response exchange is ~500 input + ~200 output tokens, "
	        "so roughly $0.0045 per resolved query. At 10,000 queries/day that is ~$45/day. "
	        "The classifier inference cost is negligible once hosted — DistilBERT runs in ~21ms "
	        "per query on CPU. Cost optimisation levers: use Claude Haiku for simple intents "
	        "and Sonnet only for complex ones, implement response caching for common queries, "
	        "or fine-tune a smaller model as a responder."
	    ),
	    simple_text=(
	        "The expensive part is asking Claude to write each reply — it costs a tiny amount "
	        "per reply, but it adds up with millions of customers. The sorting robot is almost "
	        "free to run. To save money: use the cheaper AI for easy questions, save common "
	        "replies so you only pay once, and use the expensive AI only for tricky problems."
	    ),
	    tip_text="Showing cost-awareness is impressive — it signals you think like a product engineer, not just a researcher.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 8 — BEHAVIOURAL QUESTIONS
	# ===========================================================================
	# Banner, category label, a short intro paragraph on the STAR format, then
	# three STAR-structured answers.
	story.extend(sec("Section 8: Behavioural Questions"))

	story.extend(cat(["STAR-Format Answers"]))

	story.append(
	    body(
	        "Behavioural questions use the STAR format: Situation, Task, Action, Result. "
	        "Each answer below is structured this way. Practice saying these out loud."
	    )
	)
	story.append(sp(8))

	story.extend(qa_block(
	    question="Tell me about a time you had to make a pragmatic decision under constraints.",
	    answer_text=(
	        "SITUATION: I was implementing the evaluation pipeline and had chosen RAGAS as the "
	        "framework. After installation it threw OpenAI API errors despite being configured "
	        "with Anthropic. "
	        "TASK: I needed working evaluation metrics before I could report any results. "
	        "ACTION: I investigated the root cause (RAGAS hardcoded OpenAI for embeddings, "
	        "and its async architecture caused timeouts at the API rate limit). I concluded "
	        "that patching a third-party library would take longer than building a clean "
	        "alternative. I wrote a 100-line synchronous evaluator using Claude Haiku directly. "
	        "RESULT: Clean, reproducible evaluation in 50 minutes wall-clock time, equivalent "
	        "conceptual metrics, and no external dependencies. The decision to cut scope (drop "
	        "the RAGAS library, keep the metric concepts) was the right engineering call."
	    ),
	    simple_text=(
	        "I tried to use a ready-made tool but it was broken for my use case. "
	        "I had two choices: spend days fixing the broken tool, or spend one hour building "
	        "a simpler version myself. I chose to build my own. It worked perfectly and "
	        "I learned more by building it."
	    ),
	    tip_text="This story shows: debugging skills, engineering judgment, bias for action, and pragmatism. It is one of the best stories in this project.",
	))

	story.extend(qa_block(
	    question="Tell me about a time you had to explain something technical to a non-technical person.",
	    answer_text=(
	        "SITUATION: The confidence threshold concept — why the system escalates to humans — "
	        "is technical but has a direct business impact. "
	        "TASK: Explain it so a product manager or stakeholder could understand the design decision. "
	        "ACTION: I framed it as 'the robot tells you when it's not sure'. I used the analogy "
	        "of a new employee who, when unsure, asks their manager rather than guessing. "
	        "The 70% threshold means: if the model's certainty is below 70%, a real human "
	        "handles the ticket. "
	        "RESULT: The stakeholder immediately understood both what the system does and why "
	        "the fallback matters for customer experience, without needing to understand "
	        "softmax probabilities."
	    ),
	    simple_text=(
	        "I explained that the robot says 'I'm not sure, a person should handle this' when "
	        "it's less than 70% confident. Like a new cashier who, when they're unsure about a "
	        "return policy, calls their manager rather than guessing and getting it wrong."
	    ),
	    tip_text="Prepare a non-technical explanation of every key concept. Being able to bridge technical and business language is a senior skill.",
	))

	story.extend(qa_block(
	    question="What would you do differently if you started this project again?",
	    answer_text=(
	        "Three things: First, I would pin all dependency versions immediately in "
	        "requirements.txt to avoid breaking changes (like the sklearn multi_class issue). "
	        "Second, I would design the evaluation framework before building the pipeline — "
	        "knowing I'd need faithfulness and relevancy metrics upfront would have made "
	        "me design better output schemas in the pipeline from the start. "
	        "Third, I would collect a small real-world test set (actual customer messages from "
	        "a live product) rather than splitting the training dataset — this gives a more "
	        "honest estimate of production performance."
	    ),
	    simple_text=(
	        "I would: write down exactly which version of every tool I'm using before I start, "
	        "plan how I'll test the results BEFORE building the robot (not after), "
	        "and use real customer messages for the final test instead of ones from the "
	        "same practice dataset."
	    ),
	    tip_text="Showing genuine reflection, not fake humility ('I would've worked harder') is what recruiters want. These three specific things are credible.",
	))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 9 — RAPID FIRE
	# ===========================================================================
	# Banner, category label and a one-paragraph instruction for the short answers.
	story.extend(sec("Section 9: Rapid-Fire Questions"))

	story.extend(cat(["Short, Confident Answers"]))

	story.append(
	    body(
	        "These questions expect a 1-3 sentence answer. Practice answering each in under 20 seconds."
	    )
	)
	story.append(sp(6))

	# Rapid-fire entries as (question, full answer, simple answer) tuples, in the
	# order they are rendered.
	rapid_fire = [
	    (
	        "What is a transformer model?",
	        "A neural network architecture that uses 'attention' to weigh how important each word "
	        "is relative to every other word in a sentence, enabling much better language understanding "
	        "than earlier sequential models like LSTMs.",
	        "A robot brain that reads a whole sentence at once and figures out which words "
	        "are most important based on all the other words around them.",
	    ),
	    (
	        "What is tokenisation?",
	        "The process of splitting raw text into subword units (tokens) that the model can process. "
	        "DistilBERT uses WordPiece tokenisation, which breaks rare words into common subword pieces "
	        "to handle a fixed vocabulary.",
	        "Chopping up a sentence into small pieces the robot can understand. 'unbelievable' "
	        "might become ['un', '##believ', '##able'] — three pieces.",
	    ),
	    (
	        "What is softmax?",
	        "A function that converts a vector of raw scores (logits) into a probability distribution "
	        "summing to 1.0. Used as the final layer in classification to produce interpretable confidence scores.",
	        # The percentages must match the logits quoted in the same sentence:
	        # softmax([4.2, 0.3]) = 1 / (1 + e**-3.9) ~= 0.98, so 98% / 2% (not 80% / 20%).
	        "A calculator that takes a list of numbers and converts them into percentages that "
	        "add up to 100%. So 'billing: 4.2, login: 0.3' becomes 'billing: 98%, login: 2%'.",
	    ),
	    (
	        "What is overfitting?",
	        "When a model memorises the training data so well that it performs poorly on unseen data. "
	        "It learns noise and specific examples rather than general patterns.",
	        "The robot studied so hard for its practice test that it memorised all the exact "
	        "questions. On the real test with different questions, it fails because it memorised "
	        "instead of understanding.",
	    ),
	    (
	        "What is the difference between a language model and a classifier?",
	        "A language model generates text (predicts the next token). A classifier assigns a "
	        "label to an input from a fixed set of categories. DistilBERT here is used as a classifier "
	        "(with a classification head), not as a generator. Claude is the language model.",
	        "The classifier is like a sorting machine that puts things in boxes. "
	        "The language model is like a writer that creates new text. "
	        "This project uses both: one to sort, one to write.",
	    ),
	    (
	        "What is knowledge distillation?",
	        "A technique where a smaller 'student' model is trained to mimic the outputs of a larger "
	        "'teacher' model. DistilBERT was distilled from BERT: the student learns to match BERT's "
	        "output distributions, not just the training labels.",
	        "Like a wise teacher summarising all their knowledge into a compact book for a student. "
	        "The student (DistilBERT) is smaller but very smart because it learned from the big teacher (BERT).",
	    ),
	    (
	        "What is an epoch?",
	        "One full pass through the entire training dataset. Training for 3 epochs means the model "
	        "sees every training example 3 times. More epochs can improve performance but risk overfitting.",
	        "The robot reading every single practice example once. Three epochs = the robot "
	        "read the whole practice book three times.",
	    ),
	    (
	        "What is gradient descent?",
	        "An optimisation algorithm that iteratively adjusts model weights in the direction that "
	        "reduces the loss function. The learning rate controls the size of each step.",
	        "Imagine rolling a ball down a hill to find the lowest point. Gradient descent "
	        "is the maths that tells the robot which direction 'downhill' is, so it can improve "
	        "its answers little by little.",
	    ),
	    (
	        "What is the Anthropic API?",
	        "A REST API provided by Anthropic that allows developers to send messages to Claude models "
	        "and receive generated text responses. It requires an API key and is billed per token.",
	        "It's a way to talk to Claude (the AI) from your own program. You send a message, "
	        "Claude sends back a reply. Like texting, but for code.",
	    ),
	    (
	        "What is a confusion matrix?",
	        "A table showing predicted vs actual labels for a classifier. Rows are actual classes, "
	        "columns are predicted classes. Diagonal cells are correct predictions; off-diagonal "
	        "cells are misclassifications.",
	        "A report card showing where the robot gets confused. If it often mixes up "
	        "'billing_issue' and 'cancellation_request', those cells will be bright in the table.",
	    ),
	]

	# Render each rapid-fire entry as: question, full answer, simple answer,
	# separated by small spacers and closed with a divider rule.
	for prompt, full_answer, plain_answer in rapid_fire:
	    story.extend((
	        sp(4),
	        q(prompt),
	        sp(2),
	        a(full_answer),
	        sp(2),
	        simple(plain_answer),
	        sp(4),
	        rule(),
	    ))

	story.append(PageBreak())

	# ===========================================================================
	# SECTION 10 — QUESTIONS TO ASK
	# ===========================================================================
	# Banner, category label and the intro paragraph for the candidate's own questions.
	story.extend(sec("Section 10: Questions YOU Should Ask the Interviewer"))

	story.extend(cat(["Show Curiosity & Depth"]))

	story.append(
	    body(
	        "Asking smart questions at the end of an interview shows genuine interest, "
	        "seniority, and that you have thought beyond the code. Have at least 3-4 ready."
	    )
	)
	story.append(sp(10))

	# (question to ask the interviewer, why it works) pairs, in display order.
	questions_to_ask = [
	    (
	        "How do you currently handle intent classification in your customer support pipeline, "
	        "and what are the biggest pain points?",
	        "This shows you're thinking about real-world application and positioning your skills "
	        "against actual problems they face. It also opens a dialogue about how your project "
	        "experience is relevant.",
	    ),
	    (
	        "What does your model evaluation and monitoring setup look like in production? "
	        "How do you detect when a model starts degrading?",
	        "This shows you think about the full ML lifecycle — not just training, but "
	        "post-deployment health. It's a question a senior ML engineer would ask.",
	    ),
	    (
	        "How do you balance automation confidence with the cost of human escalation? "
	        "Where do you draw the line between automated response and human review?",
	        "This ties directly to your project's confidence threshold design. "
	        "It shows you understand the business trade-off, not just the technical one.",
	    ),
	    (
	        "What is the main bottleneck in your current NLP/LLM pipeline — is it latency, "
	        "accuracy, cost, or something else?",
	        "This is a strategic question that shows you understand constraints. "
	        "The answer will tell you a lot about the team's priorities.",
	    ),
	    (
	        "How do you manage prompt versioning when you update templates that are live in production?",
	        "This is a sharp, specific question about LLMOps. Most companies struggle with this "
	        "and it shows you have thought about deployment realities beyond just building the model.",
	    ),
	    (
	        "How does the team approach handling new intent categories that weren't in the original training set?",
	        "This shows you understand model limitations (out-of-distribution inputs) and are "
	        "thinking about long-term maintenance.",
	    ),
	]

	# Both paragraph styles are the same for every question, so they are built once
	# here (like the other module-level *_STYLE constants) instead of being
	# re-created on every loop iteration.
	QUESTION_TO_ASK_STYLE = ParagraphStyle(
	    "QtoAsk", parent=styles["Normal"],
	    fontSize=11, leading=16, textColor=colors.HexColor("#0f3460"),
	    fontName="Helvetica-BoldOblique", leftIndent=10, spaceAfter=4,
	    borderColor=colors.HexColor("#0f3460"), borderWidth=1,
	    borderPad=8, backColor=colors.HexColor("#f0f4ff"), borderRadius=4,
	)
	WHY_WORKS_STYLE = ParagraphStyle(
	    "WhyWorks", parent=styles["Normal"],
	    fontSize=10, leading=14, textColor=colors.HexColor("#374151"),
	    leftIndent=10, spaceAfter=6,
	    backColor=colors.HexColor("#f9fafb"),
	    borderColor=colors.HexColor("#d1d5db"), borderWidth=0.5,
	    borderPad=6,
	)

	# One numbered entry per question: heading, the quoted question, the
	# rationale, then a divider. Numbering starts at 1 for display.
	for i, (q_text, why_text) in enumerate(questions_to_ask, 1):
	    story += [
	        sp(4),
	        Paragraph(f"Question {i}:", CATEGORY_STYLE),
	        Paragraph(f'"{q_text}"', QUESTION_TO_ASK_STYLE),
	        sp(4),
	        Paragraph(f"Why this works: {why_text}", WHY_WORKS_STYLE),
	        sp(4),
	        rule(),
	    ]

	story.append(PageBreak())

	# ===========================================================================
	# QUICK REFERENCE CHEAT SHEET
	# ===========================================================================
	# Banner and intro paragraph shown above the key-numbers table.
	story.extend(sec("Quick Reference — Key Numbers to Remember"))

	story.append(
	    body(
	        "Memorise these numbers. Quoting exact results confidently makes a strong impression."
	    )
	)
	story.append(sp(10))

	# Rows of the key-numbers table; the first row is the column header.
	cheat_sheet_data = [
	    ["Metric", "Value", "What It Means"],
	    # Classifier quality
	    ["Baseline Weighted F1", "0.9958", "TF-IDF + Logistic Regression accuracy"],
	    ["DistilBERT Weighted F1", "0.9825", "Fine-tuned transformer accuracy"],
	    ["Min per-class F1 (Baseline)", "0.985", "Worst single class performance"],
	    ["Min per-class F1 (DistilBERT)", "0.953", "Worst single class performance"],
	    # LLM response evaluation
	    ["Answer Relevancy", "0.837 (PASS)", "LLM responses address customer questions"],
	    ["Faithfulness", "0.667 (expected low)", "LLM generates beyond the template — intentional"],
	    ["Confidence threshold", "0.70", "Below this, route to human review"],
	    # Data and training setup
	    ["Training data size", "26,872 examples", "Full Bitext dataset"],
	    ["CPU training subsample", "3,000 examples", "Adaptive for CPU-only training"],
	    ["Training steps (CPU)", "300 steps", "~20 min on CPU"],
	    ["Evaluation queries", "50 queries", "RAGAS-style evaluation sample"],
	    # Model size and speed
	    ["Baseline model size", "0.4 MB", "TF-IDF + LR pickle"],
	    ["DistilBERT model size", "4,088 MB", "Fine-tuned transformer weights"],
	    ["Baseline inference", "0.15 ms/sample", "Extremely fast"],
	    ["DistilBERT inference", "21.18 ms/sample", "140x slower but much more capable"],
	    # Scope
	    ["Intent categories", "6", "billing, account, technical, inquiry, cancellation, feedback"],
	    ["Test set queries (generation)", "200 queries", "Subsampled for LLM generation pipeline"],
	]

	# Look up the two highlighted rows by their metric label instead of hard-coding
	# row numbers: the previous indices (6 and 7) pointed one row too low, colouring
	# "Faithfulness" green and "Confidence threshold" amber. Data row indices equal
	# table row indices because the header is row 0 of cheat_sheet_data.
	cheat_row_by_label = {row[0]: idx for idx, row in enumerate(cheat_sheet_data)}
	relevancy_row = cheat_row_by_label["Answer Relevancy"]
	faithfulness_row = cheat_row_by_label["Faithfulness"]

	# Column widths are lengths in points, so every entry must be scaled by `cm`
	# (a bare `6cm` is not valid Python).
	cheat = Table(cheat_sheet_data, colWidths=[6*cm, 4.5*cm, 6*cm])
	cheat.setStyle(TableStyle([
	    # Header row
	    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")),
	    ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
	    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
	    # Whole table
	    ("FONTSIZE", (0, 0), (-1, -1), 9),
	    ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f0f4ff"), colors.white]),
	    ("ALIGN", (1, 0), (1, -1), "CENTER"),
	    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
	    ("GRID", (0, 0), (-1, -1), 0.4, colors.HexColor("#dee2e6")),
	    ("TOPPADDING", (0, 0), (-1, -1), 6),
	    ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
	    ("LEFTPADDING", (0, 0), (-1, -1), 6),
	    # Highlight the pass/fail rows: green for the passing Answer Relevancy
	    # value, amber for the below-target Faithfulness value (cells are (col, row)).
	    ("TEXTCOLOR", (1, relevancy_row), (1, relevancy_row), colors.HexColor("#065f46")),
	    ("TEXTCOLOR", (1, faithfulness_row), (1, faithfulness_row), colors.HexColor("#92400e")),
	    ("FONTNAME", (1, relevancy_row), (1, relevancy_row), "Helvetica-Bold"),
	    ("FONTNAME", (1, faithfulness_row), (1, faithfulness_row), "Helvetica-Bold"),
	]))
	story.append(cheat)
	story.append(sp(16))

	# Final encouragement: a thick rule, a centred headline, and a closing paragraph.
	story.extend((
	    HRFlowable(
	        width="100%",
	        thickness=2,
	        color=colors.HexColor("#0f3460"),
	        spaceAfter=12,
	    ),
	    Paragraph(
	        "You Built This. Own It.",
	        ParagraphStyle(
	            "Final",
	            parent=styles["Normal"],
	            fontSize=16,
	            textColor=colors.HexColor("#0f3460"),
	            fontName="Helvetica-Bold",
	            alignment=TA_CENTER,
	            spaceAfter=8,
	        ),
	    ),
	    Paragraph(
	        "Every number in that cheat sheet came from code you wrote. "
	        "Every decision — from the confidence threshold to the custom evaluator — "
	        "was yours. When an interviewer asks about this project, you are the expert "
	        "in the room. Speak with confidence.",
	        ParagraphStyle(
	            "FinalBody",
	            parent=styles["Normal"],
	            fontSize=11,
	            leading=17,
	            textColor=colors.HexColor("#374151"),
	            alignment=TA_CENTER,
	            spaceAfter=6,
	        ),
	    ),
	))

	# ---------------------------------------------------------------------------
	# Build PDF
	# ---------------------------------------------------------------------------
	# Lay the accumulated story out on A4 pages at OUTPUT, then print the
	# resolved path of the written file.
	doc = SimpleDocTemplate(
	    str(OUTPUT),
	    title="Interview Prep — Customer Support AI",
	    author="Claude Code",
	    pagesize=A4,
	    topMargin=2.5 * cm,
	    bottomMargin=2.5 * cm,
	    leftMargin=2 * cm,
	    rightMargin=2 * cm,
	)
	doc.build(story)
	print("PDF saved -> {}".format(OUTPUT.resolve()))