Spaces:

pro580
/

customer-support-agent

Running

File size: 68,139 Bytes

e323466

"""Generate a recruiter interview Q&A PDF for the intent classifier project.

Covers every likely question a recruiter or technical interviewer would ask,
with clear, simple answers explained as if to a 7-year-old — no jargon left
unexplained.
"""

import sys
from pathlib import Path

sys.path.insert(0, str(Path(__file__).resolve().parent.parent))

from reportlab.lib.pagesizes import A4
from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle
from reportlab.lib.units import cm
from reportlab.lib import colors
from reportlab.platypus import (
    SimpleDocTemplate, Paragraph, Spacer, PageBreak,
    Table, TableStyle, HRFlowable, KeepTogether
)
from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY

# Destination of the generated PDF, relative to the current working directory.
OUTPUT = Path("results/interview_prep.pdf")
# Create the output folder up front. parents=True keeps this working if OUTPUT
# is later pointed at a nested directory whose ancestors do not exist yet.
OUTPUT.parent.mkdir(parents=True, exist_ok=True)

# ---------------------------------------------------------------------------
# Styles
# ---------------------------------------------------------------------------
# Base stylesheet; every custom style below inherits from one of its entries.
styles = getSampleStyleSheet()

# NOTE: the ParagraphStyle attribute that pads a bordered/background box is
# ``borderPadding``. The shorter ``borderPad`` spelling is not a style
# attribute, so the padding previously requested on the boxed styles below
# was never applied.

# Cover page: main title.
TITLE_STYLE = ParagraphStyle(
    "ITitle", parent=styles["Title"],
    fontSize=30, textColor=colors.HexColor("#0f3460"),
    spaceAfter=12, alignment=TA_CENTER, fontName="Helvetica-Bold"
)
# Cover page: line under the title.
SUBTITLE_STYLE = ParagraphStyle(
    "ISubtitle", parent=styles["Normal"],
    fontSize=13, textColor=colors.HexColor("#533483"),
    spaceAfter=8, alignment=TA_CENTER
)
# Cover page: centred introductory paragraphs.
COVER_BODY = ParagraphStyle(
    "ICoverBody", parent=styles["Normal"],
    fontSize=11, leading=17, textColor=colors.HexColor("#1a1a2e"),
    alignment=TA_CENTER, spaceAfter=8
)
# Section banner: white text on a solid coloured bar.
SECTION_STYLE = ParagraphStyle(
    "ISection", parent=styles["Heading1"],
    fontSize=18, textColor=colors.white,
    spaceBefore=16, spaceAfter=8,
    backColor=colors.HexColor("#0f3460"),
    borderPadding=8, fontName="Helvetica-Bold"
)
# Category heading inside a section.
CATEGORY_STYLE = ParagraphStyle(
    "ICategory", parent=styles["Heading2"],
    fontSize=13, textColor=colors.HexColor("#533483"),
    spaceBefore=14, spaceAfter=4, fontName="Helvetica-Bold"
)
# Question: bold text inside a bordered, tinted box.
Q_STYLE = ParagraphStyle(
    "IQuestion", parent=styles["Normal"],
    fontSize=11, leading=16, textColor=colors.HexColor("#0f3460"),
    spaceBefore=10, spaceAfter=3, fontName="Helvetica-Bold",
    backColor=colors.HexColor("#e8f4fd"),
    borderColor=colors.HexColor("#0f3460"),
    borderWidth=1, borderPadding=7, borderRadius=4,
    leftIndent=0
)
# Answer: justified technical answer text.
A_STYLE = ParagraphStyle(
    "IAnswer", parent=styles["Normal"],
    fontSize=10, leading=16, textColor=colors.HexColor("#1a1a1a"),
    spaceBefore=4, spaceAfter=4, alignment=TA_JUSTIFY,
    leftIndent=8
)
# "Simple version" call-out box.
SIMPLE_STYLE = ParagraphStyle(
    "ISimple", parent=styles["Normal"],
    fontSize=10, leading=15, textColor=colors.HexColor("#065f46"),
    spaceBefore=4, spaceAfter=6,
    backColor=colors.HexColor("#ecfdf5"),
    borderColor=colors.HexColor("#6ee7b7"),
    borderWidth=1, borderPadding=6, borderRadius=3,
    leftIndent=8
)
# "Interview Tip" call-out box.
TIP_STYLE = ParagraphStyle(
    "ITip", parent=styles["Normal"],
    fontSize=9.5, leading=14, textColor=colors.HexColor("#92400e"),
    spaceBefore=3, spaceAfter=6,
    backColor=colors.HexColor("#fffbeb"),
    borderColor=colors.HexColor("#fcd34d"),
    borderWidth=1, borderPadding=5,
    leftIndent=8
)
# Indented bullet line.
BULLET_STYLE = ParagraphStyle(
    "IBullet", parent=styles["Normal"],
    fontSize=10, leading=15, textColor=colors.HexColor("#1a1a1a"),
    leftIndent=20, spaceAfter=3,
    bulletIndent=10
)
# Generic justified body text.
BODY_STYLE = ParagraphStyle(
    "IBody", parent=styles["Normal"],
    fontSize=10, leading=15, textColor=colors.HexColor("#374151"),
    spaceAfter=5, alignment=TA_JUSTIFY
)

# ---------------------------------------------------------------------------
# Helpers
# ---------------------------------------------------------------------------
def sec(title):
    """Return the flowables for a section banner: spacer, heading, spacer."""
    banner = Paragraph(f"  {title}", SECTION_STYLE)
    above = Spacer(1, 10)
    below = Spacer(1, 6)
    return [above, banner, below]

def cat(title):
    """Return a category heading followed by a horizontal rule.

    ``title`` may be a string or a list; for a list only the first item
    is used as the heading text.
    """
    heading_text = title[0] if isinstance(title, list) else title
    underline = HRFlowable(
        width="100%",
        thickness=0.8,
        color=colors.HexColor("#533483"),
        spaceAfter=4,
    )
    return [Paragraph(heading_text, CATEGORY_STYLE), underline]

def q(text):
    """Return ``text`` as a question paragraph prefixed with ``Q: ``."""
    labelled = f"Q: {text}"
    return Paragraph(labelled, Q_STYLE)

def a(text):
    """Return ``text`` as an answer paragraph."""
    answer = Paragraph(text, A_STYLE)
    return answer

def simple(text):
    """Return ``text`` as a "Simple version" call-out paragraph."""
    labelled = f"  Simple version: {text}"
    return Paragraph(labelled, SIMPLE_STYLE)

def tip(text):
    """Return ``text`` as an "Interview Tip" call-out paragraph."""
    labelled = f"  Interview Tip: {text}"
    return Paragraph(labelled, TIP_STYLE)

def bul(text):
    """Return ``text`` as a dash-prefixed bullet paragraph."""
    labelled = f"  - {text}"
    return Paragraph(labelled, BULLET_STYLE)

def body(text):
    """Return ``text`` as a body paragraph."""
    paragraph = Paragraph(text, BODY_STYLE)
    return paragraph

def sp(n=8):
    """Return a ``Spacer`` of width 1 and height ``n`` (default 8)."""
    gap = Spacer(1, n)
    return gap

def rule():
    """Return a thin full-width horizontal rule used to close a Q&A block."""
    return HRFlowable(
        width="100%",
        thickness=0.4,
        color=colors.HexColor("#e5e7eb"),
        spaceAfter=6,
    )

def qa_block(question, answer_text, simple_text="", tip_text="", bullets=None):
    """Build the flowables for one question-and-answer entry.

    The result always holds the question, the answer and a closing rule.
    Bullets, the simple version and the tip are added only when a truthy
    value is supplied for them, in that order.
    """
    block = [sp(4), q(question), sp(3), a(answer_text)]
    # Bullets sit directly under the answer, ahead of any call-out boxes.
    block.extend(bul(point) for point in (bullets or ()))
    if simple_text:
        block += [sp(2), simple(simple_text)]
    if tip_text:
        block += [sp(2), tip(tip_text)]
    block += [sp(4), rule()]
    return block

# ---------------------------------------------------------------------------
# Build story
# ---------------------------------------------------------------------------
# Flowables are accumulated here in document order.
story = []

# ===== COVER PAGE =====
story.extend([
    sp(50),
    Paragraph("Interview Prep Guide", TITLE_STYLE),
    Paragraph("Customer Support AI — Intent Classifier Project", SUBTITLE_STYLE),
    sp(16),
    # What the guide is for.
    Paragraph(
        "This guide prepares you to answer any question a recruiter or technical interviewer "
        "might ask about your Customer Support AI project.",
        COVER_BODY,
    ),
    sp(8),
    # How each answer is presented.
    Paragraph(
        "Every answer is written twice: once in proper technical language, and once in "
        "super-simple language — the way you would explain it to a 7-year-old. "
        "Reading both will make the concept stick.",
        COVER_BODY,
    ),
    sp(20),
])

# Summary box: three coloured tiles on the cover page.
# Paragraph treats "\n" as ordinary whitespace, so explicit <br/> tags are
# needed to force each label onto three lines. The tiles share one text style.
_cover_tile_style = ParagraphStyle(
    "ct", fontSize=13, alignment=TA_CENTER,
    textColor=colors.white, fontName="Helvetica-Bold", leading=18
)
cover_table = Table(
    [[
        Paragraph("30<br/>Questions<br/>Covered", _cover_tile_style),
        Paragraph("5<br/>Difficulty<br/>Levels", _cover_tile_style),
        Paragraph("Simple<br/>Explanation<br/>Every Time", _cover_tile_style),
    ]],
    colWidths=[5*cm, 5*cm, 5*cm]
)
cover_table.setStyle(TableStyle([
    # One background colour per tile.
    ("BACKGROUND", (0, 0), (0, 0), colors.HexColor("#0f3460")),
    ("BACKGROUND", (1, 0), (1, 0), colors.HexColor("#533483")),
    ("BACKGROUND", (2, 0), (2, 0), colors.HexColor("#2d6a4f")),
    ("ALIGN", (0, 0), (-1, -1), "CENTER"),
    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ("ROWBACKGROUNDS", (0, 0), (-1, -1), [None]),
    # White outline and dividers separate the tiles.
    ("BOX", (0, 0), (-1, -1), 1, colors.white),
    ("INNERGRID", (0, 0), (-1, -1), 1, colors.white),
    ("TOPPADDING", (0, 0), (-1, -1), 14),
    ("BOTTOMPADDING", (0, 0), (-1, -1), 14),
]))
story.append(cover_table)
story.append(PageBreak())

# ===== TABLE OF CONTENTS =====
story.extend(sec("Table of Contents"))

# (topic, page) pairs in document order; the section number comes from the
# position in this list. Page numbers are fixed strings, not computed.
_toc_entries = [
    ("The Big Picture — What Did You Build?", "3"),
    ("The Data — Where Did It Come From?", "5"),
    ("The Models — How Did You Train Them?", "7"),
    ("The Pipeline — How Does It All Connect?", "11"),
    ("Evaluation — How Do You Know It Works?", "13"),
    ("Challenges & Problem Solving", "16"),
    ("Production & Real-World Thinking", "18"),
    ("Behavioural Questions", "21"),
    ("Rapid-Fire Questions (Short Answers)", "23"),
    ("Questions YOU Should Ask the Interviewer", "25"),
]
toc_data = [["Section", "Topic", "Page"]]
toc_data.extend(
    [str(number), topic, page]
    for number, (topic, page) in enumerate(_toc_entries, start=1)
)

toc = Table(toc_data, colWidths=[1.5*cm, 12*cm, 2.5*cm])
toc.setStyle(TableStyle([
    # Header row.
    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")),
    ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
    ("FONTSIZE", (0, 0), (-1, 0), 10),
    # Body rows alternate between two backgrounds.
    ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f8f9fa"), colors.white]),
    ("FONTSIZE", (0, 1), (-1, -1), 10),
    # Centre the section-number and page columns.
    ("ALIGN", (0, 0), (0, -1), "CENTER"),
    ("ALIGN", (2, 0), (2, -1), "CENTER"),
    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ("GRID", (0, 0), (-1, -1), 0.5, colors.HexColor("#dee2e6")),
    ("TOPPADDING", (0, 0), (-1, -1), 7),
    ("BOTTOMPADDING", (0, 0), (-1, -1), 7),
    ("LEFTPADDING", (0, 0), (-1, -1), 8),
]))
story.extend([toc, PageBreak()])

# ===========================================================================
# SECTION 1 — THE BIG PICTURE
# ===========================================================================
story.extend(sec("Section 1: The Big Picture — What Did You Build?"))

story.extend(cat("Overview Questions"))

# Elevator pitch.
story.extend(qa_block(
    question="Can you give me a 60-second summary of this project?",
    answer_text=(
        "I built a two-stage automated customer support system. In stage one, a fine-tuned "
        "DistilBERT model reads an incoming customer message and classifies it into one of six "
        "intent categories — things like billing issues, account access problems, or cancellation "
        "requests. In stage two, the predicted intent is passed as context to Claude (an Anthropic "
        "LLM), which then generates a helpful, human-sounding support response tailored to that "
        "specific intent. The system also flags low-confidence predictions for human review. I "
        "evaluated the full pipeline using a custom LLM-based scoring framework for faithfulness "
        "and answer relevancy, achieving 0.837 answer relevancy on 50 test queries."
    ),
    simple_text=(
        "Imagine a robot postbox at a company. When a customer sends a message, the robot reads "
        "it and puts it in one of six boxes — like 'money problems' or 'can't log in'. Then a "
        "second, smarter robot writes a kind reply based on which box it went into. I built both "
        "robots and tested how well they work."
    ),
    tip_text="Always open with: what it does, how it works, and one key result. This answer does all three.",
))

# Motivation.
story.extend(qa_block(
    question="Why did you choose this project?",
    answer_text=(
        "Customer support automation is a genuine industry problem — companies spend billions "
        "on support operations and response quality is inconsistent. This project let me practice "
        "the full ML lifecycle in one place: data engineering, fine-tuning a transformer model, "
        "prompt engineering with a production LLM, evaluation framework design, and packaging "
        "everything into a reproducible pipeline. It also demonstrates that I understand both "
        "classical NLP (TF-IDF baseline) and modern deep learning approaches."
    ),
    simple_text=(
        "Customer support is expensive and slow. I wanted to build something that actually "
        "saves a company time and money. And it let me practice every important skill in one "
        "single project — like training for a sports competition by doing every exercise at once."
    ),
    tip_text="Show that you understood the business problem, not just the tech. Recruiters love this.",
))

# The two pipeline stages.
story.extend(qa_block(
    question="What are the two stages of the pipeline?",
    answer_text=(
        "Stage 1 is the Intent Classifier: a DistilBERT transformer model fine-tuned on labelled "
        "customer support examples. It reads the raw customer query and outputs a predicted intent "
        "label plus a confidence score. Stage 2 is the Response Generator: an Anthropic Claude "
        "model that receives the original query plus a structured prompt template filled with "
        "intent-specific guidance, and produces a personalised support response. The two stages "
        "are chained in the SupportAgent class."
    ),
    simple_text=(
        "Stage 1 is the SORTING robot — it reads the message and decides what kind of problem "
        "it is. Stage 2 is the WRITING robot — it reads the sorted message and writes a nice "
        "reply. They work together like a post office and a letter writer."
    ),
    tip_text="Draw this on a whiteboard if you get the chance. Diagrams make answers memorable.",
))

# The six intent categories.
story.extend(qa_block(
    question="What are the 6 intent categories and how did you choose them?",
    answer_text=(
        "The six categories are: billing_issue (charges, refunds, payment problems), "
        "account_access (login, password, account management), technical_support (product "
        "or service problems, delivery), product_inquiry (information, compatibility, "
        "warranty), cancellation_request (cancelling orders or subscriptions), and "
        "general_feedback (complaints, suggestions, general questions). I derived these "
        "by analysing the Bitext customer support dataset's 50+ granular intent tags and "
        "grouping them into business-meaningful categories that a real support department "
        "would use to route tickets."
    ),
    simple_text=(
        "Think of it like sorting your toys into boxes: money box, login box, broken-thing box, "
        "asking-questions box, I-want-to-quit box, and other box. These six boxes cover almost "
        "everything a customer could ever message about."
    ),
    tip_text="Mention that the categories were business-driven, not just technically convenient. This shows maturity.",
))

story.append(PageBreak())

# ===========================================================================
# SECTION 2 — THE DATA
# ===========================================================================
story.extend(sec("Section 2: The Data — Where Did It Come From?"))

story.extend(cat("Dataset Questions"))

# Dataset choice.
story.extend(qa_block(
    question="What dataset did you use and why?",
    answer_text=(
        "I used the Bitext Customer Support LLM Chatbot Training Dataset from HuggingFace, "
        "which contains 26,872 labelled customer support utterances across 50+ fine-grained "
        "intent categories. I chose it because it is publicly available, professionally "
        "labelled, representative of real support language, and large enough to fine-tune "
        "a transformer model reliably. It also covers a wide vocabulary of customer phrasings "
        "for the same intent, which helps the model generalise."
    ),
    simple_text=(
        "I found a big collection of 26,872 real customer messages on the internet. Each "
        "message already had a label saying what the customer wanted. It's like having a "
        "giant homework sheet that already has all the answers marked — perfect for teaching "
        "the robot."
    ),
    tip_text="Always know your dataset size, source, and why it was appropriate. These are standard first questions.",
))

# Preprocessing steps.
story.extend(qa_block(
    question="How did you preprocess the data?",
    answer_text=(
        "Preprocessing involved three steps: (1) Text cleaning — converting text to lowercase, "
        "stripping non-ASCII characters, and normalising whitespace using regex. This reduces "
        "vocabulary noise without removing meaningful content. (2) Label mapping — the Bitext "
        "dataset has 50+ granular tags which I mapped to my 6 business categories using a "
        "keyword-based dictionary (LABEL_MAP). Labels that didn't match a keyword got assigned "
        "via a fallback heuristic. (3) Stratified splitting — I split the data 70/15/15 into "
        "train/validation/test sets using sklearn's train_test_split with stratify=label, "
        "ensuring all 6 classes are proportionally represented in every split."
    ),
    simple_text=(
        "I cleaned the messages (made everything lowercase, removed weird characters), "
        "then sorted the 50+ original label types into my 6 big categories, "
        "and finally split the data into three piles: a teaching pile, a practice pile, "
        "and a final exam pile."
    ),
    tip_text="Stratified splitting is an important detail that shows you understand class imbalance. Mention it confidently.",
))

# Stratified splitting.
story.extend(qa_block(
    question="What is stratified splitting and why does it matter?",
    answer_text=(
        "Stratified splitting means that when you divide your data into train, validation, "
        "and test sets, you ensure each set contains the same proportion of each class label "
        "as the original dataset. Without this, you might accidentally put all examples of a "
        "rare class into the training set and have none in the test set, making evaluation "
        "meaningless. sklearn's train_test_split with stratify=y handles this automatically."
    ),
    simple_text=(
        "Imagine you have 10 red balls and 90 blue balls. Stratified splitting means that "
        "no matter which pile you make, each pile has roughly 10% red and 90% blue. "
        "If you did it randomly, you might get a pile that's 100% blue and never test "
        "if the robot can recognise red ones."
    ),
    tip_text="This is a classic interview topic. Knowing why it matters (not just what it is) impresses interviewers.",
))

# Label mapping and the fallback rule.
story.extend(qa_block(
    question="You mapped 50+ labels to 6. How did you handle ambiguous labels?",
    answer_text=(
        "I built a LABEL_MAP dictionary that maps each of the Bitext tags to one of my 6 "
        "categories using exact string matching. For any tag that wasn't explicitly in the "
        "dictionary, I applied a keyword fallback: if the tag string contained words like "
        "'bill', 'charge', or 'payment', it was assigned to billing_issue, and so on for each "
        "category. This covered the vast majority of cases. About 973 rows used the fallback. "
        "In a production system, I would review these fallback assignments manually to ensure "
        "accuracy."
    ),
    simple_text=(
        "I made a lookup table — like a translation dictionary. If a label was in the "
        "dictionary, I used that translation. If not, I tried to guess from the words in "
        "the label name. Like if a label said 'billing_adjustment', I could guess it belongs "
        "in the money/billing box because it contains the word 'billing'."
    ),
    tip_text="Acknowledging the 973 fallback rows and saying you'd manually review them shows intellectual honesty.",
))

story.append(PageBreak())

# ===========================================================================
# SECTION 3 — THE MODELS
# ===========================================================================
story.extend(sec("Section 3: The Models — How Did You Train Them?"))

story.extend(cat("Baseline Model Questions"))

# Why a baseline exists.
story.extend(qa_block(
    question="You built two classifiers. What is the baseline and why did you build it?",
    answer_text=(
        "The baseline is a TF-IDF vectoriser combined with a Logistic Regression classifier, "
        "implemented as a single sklearn Pipeline. TF-IDF converts each message into a vector "
        "of numbers representing word importance scores. Logistic Regression then finds the "
        "linear decision boundary that separates the 6 classes. I built it first because: "
        "(1) it trains in milliseconds, (2) it provides a performance floor to compare against, "
        "and (3) it demonstrates that I understand when simpler models are appropriate."
    ),
    simple_text=(
        "Before building the fancy robot, I built a simple one. The simple one counts which "
        "words appear in a message and uses that to guess the category. It's like a calculator "
        "vs a smartphone. I built the calculator first to prove the smartphone was actually "
        "worth building."
    ),
    tip_text="Always justify your baseline. Interviewers want to see that you built it deliberately, not as an afterthought.",
))

# TF-IDF definition.
story.extend(qa_block(
    question="What is TF-IDF?",
    answer_text=(
        "TF-IDF stands for Term Frequency — Inverse Document Frequency. TF measures how often "
        "a word appears in one document (high TF = word is frequent in this doc). IDF measures "
        "how rare the word is across all documents (high IDF = word is unique to few docs). "
        "Multiplying them gives a score that is high for words that are common in one document "
        "but rare across the whole dataset — these are the most informative words. Common words "
        "like 'the' or 'is' get near-zero scores because they appear everywhere."
    ),
    simple_text=(
        "Imagine every word gets a score. A word that appears a lot in just ONE message "
        "gets a high score — it's special to that message. A word like 'the' that appears "
        "in every single message gets a low score — it tells us nothing. TF-IDF is just a "
        "formula for giving each word its specialness score."
    ),
    tip_text="TF-IDF is a very common interview question. Learn this definition by heart.",
))

# Baseline versus DistilBERT result.
story.extend(qa_block(
    question="Your baseline (0.9958 F1) outperformed DistilBERT (0.9825 F1). How do you explain that?",
    answer_text=(
        "There are two reasons. First, the dataset itself: the Bitext dataset is professionally "
        "labelled and uses very consistent, formal language for each intent. TF-IDF word counts "
        "are perfectly sufficient to separate these clean categories — specific keywords almost "
        "uniquely identify each class. Second, the training constraint: I was running on CPU "
        "only, so I subsampled to 3,000 training examples and capped training at 300 steps. "
        "DistilBERT trained on the full dataset with more epochs would likely match or exceed "
        "the baseline. The baseline advantage is a dataset characteristic, not evidence that "
        "DistilBERT is a worse model."
    ),
    simple_text=(
        "The fancy robot did slightly worse because I couldn't let it study for long enough — "
        "it only had 300 practice rounds instead of thousands. The simple robot was good enough "
        "for this particular test because the messages in the dataset use very predictable words. "
        "If we had messier, real-world messages, the fancy robot would win."
    ),
    tip_text=(
        "This is almost guaranteed to come up. Interviewers love testing whether you understand "
        "your own results. The two-part answer (dataset quality + training constraint) is impressive."
    ),
))

story.extend(cat("DistilBERT & Fine-Tuning Questions"))

# Model choice.
story.extend(qa_block(
    question="What is DistilBERT and why did you choose it?",
    answer_text=(
        "DistilBERT is a smaller, faster version of BERT (Bidirectional Encoder Representations "
        "from Transformers) created by HuggingFace using a technique called knowledge distillation. "
        "It retains 97% of BERT's language understanding while being 40% smaller and 60% faster. "
        "I chose it over full BERT because: (1) I was training on CPU, so speed and memory matter, "
        "(2) 97% performance retention is sufficient for a classification task, and (3) it is "
        "a production-proven model with excellent HuggingFace support."
    ),
    simple_text=(
        "BERT is a very smart robot brain that has read millions of books and websites. "
        "DistilBERT is BERT's younger sibling — 40% smaller, almost just as smart. I picked "
        "the little sibling because it runs faster on my computer, and for sorting six categories "
        "of messages, the little sibling is smart enough."
    ),
    tip_text="Justify model choice with concrete numbers (40% smaller, 97% performance, 60% faster). Don't just say 'it's popular'.",
))

# Fine-tuning.
story.extend(qa_block(
    question="What is fine-tuning and what does it mean to fine-tune DistilBERT?",
    answer_text=(
        "Fine-tuning means taking a pre-trained model — one that has already learned general "
        "language understanding from a massive text corpus — and continuing to train it on a "
        "smaller, task-specific dataset. The pre-trained model already knows grammar, context, "
        "and word meanings. Fine-tuning teaches it the specifics of your task. For DistilBERT, "
        "this means: (1) loading the pre-trained weights, (2) adding a classification head "
        "(a new linear layer that outputs 6 class probabilities), and (3) training the entire "
        "model end-to-end on the labelled customer support data."
    ),
    simple_text=(
        "Imagine you hire someone who already speaks fluent English and has read every book "
        "ever written. Fine-tuning is like giving that person a one-week crash course on "
        "customer support specifically. They already know words and sentences — you just "
        "teach them your specific job. Much faster than training someone from scratch."
    ),
    tip_text="Use the 'pre-trained + task-specific' framing. It's the standard mental model for fine-tuning.",
))

# Classification head.
story.extend(qa_block(
    question="What is a classification head?",
    answer_text=(
        "A classification head is a simple linear layer added on top of a pre-trained model. "
        "DistilBERT's core outputs a 768-dimensional vector (called the [CLS] token embedding) "
        "that represents the meaning of the entire input sentence. The classification head "
        "multiplies this 768-dimensional vector by a weight matrix to produce 6 output "
        "scores (one per class), then applies softmax to convert them into probabilities. "
        "During fine-tuning, both the DistilBERT weights and the classification head weights "
        "are updated."
    ),
    simple_text=(
        "DistilBERT reads a sentence and produces a big list of 768 numbers that summarises "
        "the meaning. The classification head is like a voting machine — it takes those 768 "
        "numbers, does some maths, and outputs 6 scores: 'billing: 80%, login: 5%, ...' "
        "The highest score wins and becomes the prediction."
    ),
    tip_text="Knowing the dimension (768) and that softmax converts logits to probabilities is a strong technical detail.",
))

# Hyperparameters.
story.extend(qa_block(
    question="What hyperparameters did you tune and why?",
    answer_text=(
        "Key hyperparameters: learning_rate=2e-5 (standard for BERT fine-tuning; too high "
        "causes catastrophic forgetting, too low means no learning), max_length=128 tokens "
        "(sufficient for short support queries, reduces memory), batch_size=16 (balance "
        "between gradient quality and memory on CPU), max_steps=300 (CPU-adaptive cap to "
        "complete training in reasonable time), warmup_steps=int(0.1 * max_steps) (prevents "
        "large gradient updates in early training when weights are random). These are "
        "standard recommendations from the original BERT paper, adapted for CPU constraints."
    ),
    simple_text=(
        "Hyperparameters are like the settings on an oven before you bake a cake. "
        "Learning rate is how fast the robot adjusts — too fast and it forgets everything, "
        "too slow and it never learns. Batch size is how many examples it looks at "
        "before updating. Warmup steps is a gentle warm-up period, like stretching "
        "before exercise."
    ),
    tip_text=(
        "Always be able to explain WHY you set each hyperparameter, not just what you set it to. "
        "'2e-5 is standard for BERT fine-tuning per the original paper' is a strong answer."
    ),
))

# CPU-only training.
story.extend(qa_block(
    question="How did you handle training on CPU only?",
    answer_text=(
        "I implemented automatic hardware detection at the start of training using "
        "torch.cuda.is_available(). When no GPU is detected, the training script activates "
        "two adaptive strategies: (1) Data subsampling — it stratified-samples 3,000 examples "
        "from the full training set rather than training on all 18,000, ensuring all 6 classes "
        "remain represented; (2) Step capping — it sets max_steps=300 instead of training for "
        "multiple full epochs. This reduces training time from ~20 hours to ~20 minutes while "
        "still producing a functional model."
    ),
    simple_text=(
        "Training a big neural network without a GPU is like running a marathon on crutches — "
        "very slow. So I wrote code that detects 'no GPU found' and automatically switches "
        "to a faster, smaller version of the training: fewer examples, fewer steps. "
        "The robot doesn't learn as much, but it learns enough, and it finishes in 20 minutes "
        "instead of 20 hours."
    ),
    tip_text="This shows engineering pragmatism — you adapted to constraints rather than just failing. Interviewers love this.",
))

story.append(PageBreak())

# ===========================================================================
# SECTION 4 — THE PIPELINE
# ===========================================================================
story += sec("Section 4: The Pipeline — How Does It All Connect?")

story += cat(["Architecture Questions"])

story += qa_block(
    question="Walk me through what happens when a customer sends a message.",
    answer_text=(
        "1. The raw customer query arrives at SupportAgent.resolve(). "
        "2. IntentClassifier.predict() tokenises the text, runs it through DistilBERT, "
        "and returns the top predicted intent label plus a confidence score (softmax probability). "
        "3. If confidence is below 0.70, the agent sets requires_human=True and returns a "
        "flag for human review without calling the LLM. "
        "4. Otherwise, get_template() fetches the intent-specific prompt template. "
        "format_user_prompt() fills in the customer query. "
        "5. ResponseGenerator.generate() sends the system prompt and user prompt to "
        "Claude via the Anthropic API and receives the generated response. "
        "6. The agent returns a dict containing the query, intent, confidence, response, "
        "context, and human_review flag."
    ),
    simple_text=(
        "Step 1: Customer writes a message. Step 2: Robot 1 reads it and decides which "
        "of 6 boxes it belongs to (and how sure it is). Step 3: If the robot is not sure "
        "enough (less than 70% confident), it raises a flag and a real human will handle it. "
        "Step 4: If the robot is sure, it picks the right letter template for that topic. "
        "Step 5: Robot 2 (Claude) reads the template and writes a personalised reply. "
        "Step 6: The full reply plus all the details are returned."
    ),
    tip_text="Practice saying this as a numbered list out loud. Being able to narrate a system end-to-end is a strong interview skill."
)

# Q&A entry: what prompt engineering is and how the intent templates use it.
story.extend(qa_block(
    question="What is prompt engineering and how did you use it?",
    answer_text=(
        "Prompt engineering is the practice of crafting input text to an LLM to guide it "
        "toward producing a desired output. In this project, I designed 6 intent-specific "
        "prompt templates, each with a system prompt (setting the LLM's role and tone) and "
        "a user prompt (providing the customer query plus intent-specific guidance). "
        "For example, the billing_issue template instructs the model to acknowledge the "
        "financial concern, show empathy, and offer concrete next steps. This structured "
        "approach ensures consistent, on-brand responses without requiring the LLM to guess "
        "the appropriate tone and content."
    ),
    simple_text=(
        "Prompt engineering is writing good instructions for the robot. Instead of just "
        "saying 'write a reply', I say 'you are a friendly support agent, the customer has "
        "a billing problem, be empathetic, offer to help fix it'. The better your instructions, "
        "the better the robot's answer."
    ),
    tip_text=(
        "Mention that you have 6 separate templates, not one generic one. "
        "This shows attention to detail."
    ),
))

# Q&A entry: why predictions under the 0.70 confidence threshold go to a human.
story.extend(qa_block(
    question="Why does the system flag low-confidence predictions for human review?",
    answer_text=(
        "The confidence threshold (0.70) acts as a safety net. When the classifier's softmax "
        "probability for the top class is below 70%, it indicates the model is uncertain — "
        "the input may be ambiguous, out-of-distribution, or phrased in a way the model "
        "hasn't seen. Sending an uncertain intent to the LLM would generate a response built "
        "on a potentially wrong context, which could mislead or frustrate the customer. "
        "Flagging for human review prevents poor automated responses from reaching customers "
        "while still automating the confident majority."
    ),
    simple_text=(
        "Imagine asking the sorting robot 'are you sure?' — if it's less than 70% sure, "
        "it says 'I'm not confident, a human should handle this one'. This is important "
        "because if the robot sorts the message into the wrong box, the reply will be "
        "totally wrong. Better to get a human than to send a bad automated reply."
    ),
    tip_text=(
        "This shows you designed for real-world use, not just accuracy metrics. "
        "Production-readiness thinking."
    ),
))

story.append(PageBreak())

# ===========================================================================
# SECTION 5 — EVALUATION
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 5: Evaluation — How Do You Know It Works?"))
story.extend(cat(["Metrics & Evaluation Questions"]))

# Q&A entry: weighted F1 and why it was preferred over plain accuracy.
story.extend(qa_block(
    question="What is weighted F1 score and why did you use it?",
    answer_text=(
        "F1 score is the harmonic mean of precision and recall. Precision asks: of all the "
        "messages I labelled as 'billing_issue', how many actually were? Recall asks: of all "
        "the actual billing_issue messages, how many did I catch? The harmonic mean penalises "
        "imbalanced precision/recall more than the arithmetic mean. Weighted F1 averages the "
        "per-class F1 scores, weighting each class by its number of examples. I chose weighted "
        "F1 over accuracy because it better handles class imbalance — accuracy alone can be "
        "misleadingly high if one class dominates."
    ),
    simple_text=(
        "Imagine a test where 90% of questions are easy and 10% are hard. If you only answer "
        "the easy ones, you score 90% but you're failing on the hard ones. F1 score checks "
        "BOTH whether your answers are correct AND whether you answered all the questions — "
        "not just the easy majority."
    ),
    tip_text=(
        "Knowing why you chose F1 over accuracy is a very common interview question. "
        "Always have this answer ready."
    ),
))

# Q&A entry: what RAGAS is and how its metrics were re-implemented.
story.extend(qa_block(
    question="What is RAGAS and how did you use it?",
    answer_text=(
        "RAGAS (Retrieval-Augmented Generation Assessment) is an open-source evaluation "
        "framework originally designed to measure the quality of RAG pipeline outputs. "
        "It provides metrics including Faithfulness (does the response stay within the "
        "provided context?) and Answer Relevancy (does the response address the question?). "
        "I initially attempted to use the RAGAS library but encountered dependency conflicts "
        "— it required OpenAI embeddings by default. I ultimately implemented the same metrics "
        "directly using Claude Haiku as the evaluator LLM, bypassing the library while "
        "preserving the conceptual framework."
    ),
    simple_text=(
        "RAGAS is a tool for grading AI replies. Faithfulness asks: did the robot stick to "
        "what it was told, or did it make things up? Answer Relevancy asks: did the robot "
        "actually answer the question? I tried using the RAGAS tool but it had technical "
        "problems, so I built my own version that does the same grading."
    ),
    tip_text=(
        "Be upfront about the dependency issue and your workaround. "
        "Showing problem-solving is better than hiding struggles."
    ),
))

# Q&A entry: interpreting the 0.667 faithfulness score against its 0.85 target.
story.extend(qa_block(
    question="Your faithfulness score was 0.667, below the 0.85 target. Is that a failure?",
    answer_text=(
        "Not in this context. Faithfulness in RAGAS measures whether the generated response "
        "is grounded in the provided context document. In a RAG system with a knowledge base, "
        "a low faithfulness score means the model hallucinated facts. But in this system, "
        "the 'context' is a prompt template with minimal content — it contains guidance and "
        "tone instructions, not a database of facts. Claude is expected to generate helpful "
        "domain knowledge (like explaining billing processes) that is not literally in the "
        "template. This is correct, desirable behaviour. The more meaningful metric here is "
        "Answer Relevancy (0.837), which passed its target of 0.80."
    ),
    simple_text=(
        "Faithfulness is like asking 'did the robot only use words from the instruction card?' "
        "But our instruction card only has general guidelines, not specific facts. So when "
        "the robot adds helpful details (like how to reset a password), it 'fails' faithfulness "
        "even though its answer was actually great. The more important score — did it answer "
        "the right question? — passed with 0.837."
    ),
    tip_text=(
        "This is the most nuanced result in the project. Interviewers who see the 0.667 will "
        "test you on it. Have this explanation ready and be confident — you are NOT making excuses, "
        "you are correctly identifying a metric limitation."
    ),
))

# Q&A entry: the LLM-as-judge evaluation procedure for generated responses.
story.extend(qa_block(
    question="How did you evaluate the LLM-generated responses?",
    answer_text=(
        "I implemented a custom synchronous evaluator using Claude Haiku as the judge LLM. "
        "For each of the 50 test responses, I sent two evaluation prompts to Claude Haiku: "
        "one asking it to score faithfulness (0.0-1.0) and one asking it to score answer "
        "relevancy (0.0-1.0). Each prompt asked for only a single decimal number in the reply "
        "(max_tokens=10, temperature=0 for determinism). I then computed mean, median, std, "
        "min, and max across all 50 scores. Results were saved to results/ragas_scores.json."
    ),
    simple_text=(
        "I used a second AI (Claude Haiku) to grade the first AI's answers. For each answer, "
        "I asked Haiku two questions: 'How well does this answer stick to the topic? Score "
        "0 to 1' and 'How well does this answer address what the customer asked? Score 0 to 1'. "
        "Then I averaged all 50 scores to get the final grade."
    ),
    tip_text=(
        "LLM-as-judge evaluation is a hot topic in 2024-2026. "
        "Knowing why you use temperature=0 for evaluation (reproducibility) is a great detail."
    ),
))

# Q&A entry: precision versus recall and the trade-off between them.
story.extend(qa_block(
    question="What is the difference between precision and recall?",
    answer_text=(
        "Precision: of everything the model labelled as class X, what fraction actually is X? "
        "High precision = few false positives. Recall: of everything that actually is class X, "
        "what fraction did the model correctly identify? High recall = few false negatives. "
        "There is usually a trade-off: tuning for higher recall means accepting more false "
        "positives, and vice versa. The right balance depends on the cost of each error type. "
        "In a medical diagnosis context, high recall (catch all real cases) matters more. "
        "In a spam filter, high precision (don't block real emails) matters more."
    ),
    simple_text=(
        "Precision: if the robot says 'this is a cat', how often is it actually a cat? "
        "Recall: of all the real cats, how many did the robot notice? "
        "A robot that calls everything a cat has perfect recall (it never misses a cat) "
        "but terrible precision (most of what it calls cats are dogs). "
        "You need both to be good."
    ),
    tip_text=(
        "The medical/spam example is a classic way to make precision/recall trade-offs concrete. "
        "Use it."
    ),
))

story.append(PageBreak())

# ===========================================================================
# SECTION 6 — CHALLENGES
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 6: Challenges & Problem Solving"))
story.extend(cat(["'Tell Me About a Challenge' Questions"]))

# Q&A entry: the RAGAS/OpenAI dependency problem and the custom evaluator.
story.extend(qa_block(
    question="What was the hardest technical problem you faced and how did you solve it?",
    answer_text=(
        "The most significant challenge was the RAGAS evaluation framework's hard dependency "
        "on OpenAI. After installing RAGAS and configuring the Anthropic LLM wrapper, the "
        "library still tried to call OpenAI for embedding-based metrics. Attempts to swap "
        "in HuggingFace embeddings via LangchainEmbeddingsWrapper also failed due to RAGAS's "
        "internal async timeout handling. Rather than spending hours debugging a third-party "
        "library, I made the decision to implement the same conceptual metrics — faithfulness "
        "and answer relevancy — as a direct, synchronous Anthropic API loop. This removed "
        "the dependency entirely, eliminated the async timeout issue, and produced cleaner, "
        "more interpretable results."
    ),
    simple_text=(
        "I tried to use a ready-made grading tool (RAGAS) but it secretly required a "
        "different AI service (OpenAI) that I wasn't using. No matter what I tried, "
        "it kept asking for that service. So instead of fighting it, I built my own "
        "grading tool from scratch in 100 lines of code. My version was actually simpler "
        "and worked better."
    ),
    tip_text=(
        "This answer shows debugging skill, good judgment (knowing when to stop debugging), "
        "and resourcefulness. Lead with the challenge, end with the solution."
    ),
))

# Q&A entry: the adaptive CPU-mode training setup.
story.extend(qa_block(
    question="How did you deal with the slow CPU training problem?",
    answer_text=(
        "The naive training run would have taken 20+ hours on CPU — clearly impractical. "
        "I solved it with two changes: (1) Automatic detection — the code checks "
        "torch.cuda.is_available() and activates 'CPU mode' when no GPU is found. "
        "(2) Adaptive parameters — in CPU mode, training data is stratified-subsampled "
        "to 3,000 examples and max_steps is capped at 300. This reduces training time to "
        "~20 minutes while still producing a model with 0.9825 F1, which proves the approach "
        "is sound. The config file exposes cpu_train_sample and cpu_max_steps as tunable "
        "parameters so they can be adjusted."
    ),
    simple_text=(
        "Training the robot normally would take 20 hours without a special graphics card. "
        "I wrote code that detects the slow computer and automatically switches to a "
        "faster mini-training mode: less data, fewer rounds. The robot doesn't become "
        "as expert, but it still gets a 98.25% score, which proves the idea works. "
        "It's like practicing for a marathon by running 5km — you prove you can run, "
        "even if you haven't run the full 42km yet."
    ),
    tip_text=(
        "Framing this as intentional engineering (not a workaround) is important. "
        "You made a pragmatic trade-off, not a mistake."
    ),
))

# Q&A entry: handling the scikit-learn breaking change (removed ``multi_class``).
# The answer text states that LogisticRegression now fits a multinomial
# (softmax) model for multiclass targets by default. That is what makes dropping
# the old multi_class='multinomial' argument result-preserving; describing the
# default as one-vs-rest would contradict the "equivalent results" claim.
story += qa_block(
    question="sklearn 1.8 removed the multi_class parameter. How did you handle a breaking change?",
    answer_text=(
        "When I ran the baseline training script, it threw a TypeError: "
        "LogisticRegression.__init__() got an unexpected keyword argument 'multi_class'. "
        "This is because sklearn 1.8 removed the deprecated multi_class='multinomial' "
        "parameter. The fix was simple — remove the parameter from both the code and config. "
        "Modern sklearn's LogisticRegression automatically handles multiclass problems using "
        "the multinomial (softmax) formulation by default, which produces equivalent results. "
        "This was a lesson in keeping requirements pinned in production to prevent unexpected "
        "breakage."
    ),
    simple_text=(
        "A tool I was using (sklearn) got an update that removed a setting I was using. "
        "The computer gave me an error saying it didn't recognise that setting anymore. "
        "I looked it up and found out the new version doesn't need that setting — it "
        "figures it out automatically. So I deleted that line of code and everything worked. "
        "Lesson learned: always write down exactly which version of each tool you're using."
    ),
    tip_text="Handling a library breaking change gracefully and learning from it is a great story for a behavioural question."
)

story.append(PageBreak())

# ===========================================================================
# SECTION 7 — PRODUCTION THINKING
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 7: Production & Real-World Thinking"))
story.extend(cat(["Scalability & Production Questions"]))

# Q&A entry: a six-step outline for deploying the system in production.
story.extend(qa_block(
    question="How would you deploy this system in production?",
    answer_text=(
        "A production deployment would involve: (1) Serving the classifier as a REST API "
        "using FastAPI, with the model loaded into memory at startup and a /predict endpoint. "
        "(2) Containerising with Docker so the model and all dependencies are portable. "
        "(3) Deploying to a cloud provider (AWS, GCP, or Azure) with auto-scaling based on "
        "request volume. (4) Implementing a message queue (e.g. SQS or Kafka) if volume is "
        "high, so requests are processed asynchronously. (5) Caching the LLM response for "
        "duplicate or near-duplicate queries to reduce Anthropic API costs. "
        "(6) Adding monitoring/logging (latency, error rate, intent distribution) with tools "
        "like Prometheus/Grafana or Datadog."
    ),
    simple_text=(
        "To put this in a real company, I would: wrap it in a web address so other apps "
        "can call it, package it in a box (Docker) so it runs anywhere, put it on a cloud "
        "computer that can grow bigger when more people use it, save common replies so we "
        "don't call the expensive AI every time, and add a dashboard showing how well it's "
        "working every day."
    ),
    tip_text=(
        "Even if you haven't deployed it, showing you KNOW how to deploy it is enough. "
        "Mention FastAPI, Docker, and monitoring."
    ),
))

# Q&A entry: the three monitoring layers (infrastructure, ML, business).
story.extend(qa_block(
    question="How would you monitor this system once deployed?",
    answer_text=(
        "Monitoring would cover three layers: (1) Infrastructure metrics — latency, error rate, "
        "throughput (standard APM). (2) ML metrics — intent distribution drift (if billing_issue "
        "suddenly spikes, something changed), average confidence score over time (confidence drop "
        "may indicate the model is seeing new types of queries it wasn't trained on), and "
        "human_review escalation rate. (3) Business metrics — customer satisfaction, resolution "
        "time, re-contact rate. I would also implement periodic re-evaluation: run new queries "
        "through the LLM judge and alert if relevancy drops below threshold."
    ),
    simple_text=(
        "Monitoring is like a health check for the robot. I'd watch: is it fast enough? "
        "Is it confident? Are more messages than usual going to humans for review? "
        "Are customers satisfied with the replies? If any of these go wrong, "
        "it might mean the robot needs to be retrained or fixed."
    ),
    tip_text=(
        "Mentioning concept drift (confidence drops, distribution shifts) "
        "shows senior ML engineering knowledge."
    ),
))

# Q&A entry: how the model would be improved with more compute, data, and time.
story.extend(qa_block(
    question="How would you improve the model if given more resources?",
    answer_text=(
        "With a GPU: train on the full 18,000+ example dataset for 3-5 epochs with proper "
        "hyperparameter search (learning rate, batch size). "
        "With more data: collect real customer support tickets, which are messier than the "
        "Bitext dataset and would better reflect production distribution. "
        "Architecturally: (1) implement retrieval-augmented generation — instead of static "
        "prompt templates, retrieve relevant FAQ articles or resolution histories; "
        "(2) add a re-ranking step to select the best candidate response from multiple "
        "LLM generations; (3) implement active learning — flag uncertain predictions, "
        "have humans label them, and retrain periodically."
    ),
    simple_text=(
        "With a proper gaming computer: train the robot on all the data, not just a sample. "
        "With real company data: teach the robot using actual past customer conversations. "
        "With more time: instead of using a fixed template, let the robot look up real "
        "answers from the company's help pages. Like teaching someone to use a real "
        "reference book instead of memorising everything."
    ),
    tip_text=(
        "RAG as a next step is a strong answer because it shows "
        "architectural thinking beyond fine-tuning."
    ),
))

# Q&A entry: per-query cost estimate and the main cost-reduction levers.
story.extend(qa_block(
    question="What is the cost of running this system at scale?",
    answer_text=(
        "The main cost is the Anthropic API for response generation. At the time of building "
        "this, Claude Sonnet costs approximately $3 per million input tokens and $15 per million "
        "output tokens. A typical support response exchange is ~500 input + ~200 output tokens, "
        "so roughly $0.0045 per resolved query. At 10,000 queries/day that is ~$45/day. "
        "The classifier inference cost is negligible once hosted — DistilBERT runs in ~21ms "
        "per query on CPU. Cost optimisation levers: use Claude Haiku for simple intents "
        "and Sonnet only for complex ones, implement response caching for common queries, "
        "or fine-tune a smaller model as a responder."
    ),
    simple_text=(
        "The expensive part is asking Claude to write each reply — it costs a tiny amount "
        "per reply, but it adds up with millions of customers. The sorting robot is almost "
        "free to run. To save money: use the cheaper AI for easy questions, save common "
        "replies so you only pay once, and use the expensive AI only for tricky problems."
    ),
    tip_text=(
        "Showing cost-awareness is impressive — it signals you think like "
        "a product engineer, not just a researcher."
    ),
))

story.append(PageBreak())

# ===========================================================================
# SECTION 8 — BEHAVIOURAL QUESTIONS
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 8: Behavioural Questions"))
story.extend(cat(["STAR-Format Answers"]))

# Intro paragraph explaining the STAR format, then a spacer.
story.extend([
    body(
        "Behavioural questions use the STAR format: Situation, Task, Action, Result. "
        "Each answer below is structured this way. Practice saying these out loud."
    ),
    sp(8),
])

# Q&A entry: STAR answer about dropping the RAGAS library for a custom evaluator.
story.extend(qa_block(
    question="Tell me about a time you had to make a pragmatic decision under constraints.",
    answer_text=(
        "SITUATION: I was implementing the evaluation pipeline and had chosen RAGAS as the "
        "framework. After installation it threw OpenAI API errors despite being configured "
        "with Anthropic. "
        "TASK: I needed working evaluation metrics before I could report any results. "
        "ACTION: I investigated the root cause (RAGAS hardcoded OpenAI for embeddings, "
        "and its async architecture caused timeouts at the API rate limit). I concluded "
        "that patching a third-party library would take longer than building a clean "
        "alternative. I wrote a 100-line synchronous evaluator using Claude Haiku directly. "
        "RESULT: Clean, reproducible evaluation in 50 minutes wall-clock time, equivalent "
        "conceptual metrics, and no external dependencies. The decision to cut scope (drop "
        "the RAGAS library, keep the metric concepts) was the right engineering call."
    ),
    simple_text=(
        "I tried to use a ready-made tool but it was broken for my use case. "
        "I had two choices: spend days fixing the broken tool, or spend one hour building "
        "a simpler version myself. I chose to build my own. It worked perfectly and "
        "I learned more by building it."
    ),
    tip_text=(
        "This story shows: debugging skills, engineering judgment, bias for action, and pragmatism. "
        "It is one of the best stories in this project."
    ),
))

# Q&A entry: STAR answer about explaining the confidence threshold to a stakeholder.
story.extend(qa_block(
    question="Tell me about a time you had to explain something technical to a non-technical person.",
    answer_text=(
        "SITUATION: The confidence threshold concept — why the system escalates to humans — "
        "is technical but has a direct business impact. "
        "TASK: Explain it so a product manager or stakeholder could understand the design decision. "
        "ACTION: I framed it as 'the robot tells you when it's not sure'. I used the analogy "
        "of a new employee who, when unsure, asks their manager rather than guessing. "
        "The 70% threshold means: if the model's certainty is below 70%, a real human "
        "handles the ticket. "
        "RESULT: The stakeholder immediately understood both what the system does and why "
        "the fallback matters for customer experience, without needing to understand "
        "softmax probabilities."
    ),
    simple_text=(
        "I explained that the robot says 'I'm not sure, a person should handle this' when "
        "it's less than 70% confident. Like a new cashier who, when they're unsure about a "
        "return policy, calls their manager rather than guessing and getting it wrong."
    ),
    tip_text=(
        "Prepare a non-technical explanation of every key concept. "
        "Being able to bridge technical and business language is a senior skill."
    ),
))

# Q&A entry: three things the author would change on a second attempt.
story.extend(qa_block(
    question="What would you do differently if you started this project again?",
    answer_text=(
        "Three things: First, I would pin all dependency versions immediately in "
        "requirements.txt to avoid breaking changes (like the sklearn multi_class issue). "
        "Second, I would design the evaluation framework before building the pipeline — "
        "knowing I'd need faithfulness and relevancy metrics upfront would have made "
        "me design better output schemas in the pipeline from the start. "
        "Third, I would collect a small real-world test set (actual customer messages from "
        "a live product) rather than splitting the training dataset — this gives a more "
        "honest estimate of production performance."
    ),
    simple_text=(
        "I would: write down exactly which version of every tool I'm using before I start, "
        "plan how I'll test the results BEFORE building the robot (not after), "
        "and use real customer messages for the final test instead of ones from the "
        "same practice dataset."
    ),
    tip_text=(
        "Showing genuine reflection, not fake humility ('I would've worked harder') is what recruiters want. "
        "These three specific things are credible."
    ),
))

story.append(PageBreak())

# ===========================================================================
# SECTION 9 — RAPID FIRE
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 9: Rapid-Fire Questions"))
story.extend(cat(["Short, Confident Answers"]))

# Intro paragraph setting the expected answer length, then a spacer.
story.extend([
    body(
        "These questions expect a 1-3 sentence answer. Practice answering each in under 20 seconds."
    ),
    sp(6),
])

# Rapid-fire entries as (question, full answer, simplified answer) tuples.
# The softmax example uses the real softmax of the logits it quotes:
# exp(4.2) / (exp(4.2) + exp(0.3)) is roughly 0.98, so the split is 98% / 2%.
rapid_fire = [
    ("What is a transformer model?",
     "A neural network architecture that uses 'attention' to weigh how important each word "
     "is relative to every other word in a sentence, enabling much better language understanding "
     "than earlier sequential models like LSTMs.",
     "A robot brain that reads a whole sentence at once and figures out which words "
     "are most important based on all the other words around them."),

    ("What is tokenisation?",
     "The process of splitting raw text into subword units (tokens) that the model can process. "
     "DistilBERT uses WordPiece tokenisation, which breaks rare words into common subword pieces "
     "to handle a fixed vocabulary.",
     "Chopping up a sentence into small pieces the robot can understand. 'unbelievable' "
     "might become ['un', '##believ', '##able'] — three pieces."),

    ("What is softmax?",
     "A function that converts a vector of raw scores (logits) into a probability distribution "
     "summing to 1.0. Used as the final layer in classification to produce interpretable confidence scores.",
     "A calculator that takes a list of numbers and converts them into percentages that "
     "add up to 100%. So 'billing: 4.2, login: 0.3' becomes 'billing: 98%, login: 2%'."),

    ("What is overfitting?",
     "When a model memorises the training data so well that it performs poorly on unseen data. "
     "It learns noise and specific examples rather than general patterns.",
     "The robot studied so hard for its practice test that it memorised all the exact "
     "questions. On the real test with different questions, it fails because it memorised "
     "instead of understanding."),

    ("What is the difference between a language model and a classifier?",
     "A language model generates text (predicts the next token). A classifier assigns a "
     "label to an input from a fixed set of categories. DistilBERT here is used as a classifier "
     "(with a classification head), not as a generator. Claude is the language model.",
     "The classifier is like a sorting machine that puts things in boxes. "
     "The language model is like a writer that creates new text. "
     "This project uses both: one to sort, one to write."),

    ("What is knowledge distillation?",
     "A technique where a smaller 'student' model is trained to mimic the outputs of a larger "
     "'teacher' model. DistilBERT was distilled from BERT: the student learns to match BERT's "
     "output distributions, not just the training labels.",
     "Like a wise teacher summarising all their knowledge into a compact book for a student. "
     "The student (DistilBERT) is smaller but very smart because it learned from the big teacher (BERT)."),

    ("What is an epoch?",
     "One full pass through the entire training dataset. Training for 3 epochs means the model "
     "sees every training example 3 times. More epochs can improve performance but risk overfitting.",
     "The robot reading every single practice example once. Three epochs = the robot "
     "read the whole practice book three times."),

    ("What is gradient descent?",
     "An optimisation algorithm that iteratively adjusts model weights in the direction that "
     "reduces the loss function. The learning rate controls the size of each step.",
     "Imagine rolling a ball down a hill to find the lowest point. Gradient descent "
     "is the maths that tells the robot which direction 'downhill' is, so it can improve "
     "its answers little by little."),

    ("What is the Anthropic API?",
     "A REST API provided by Anthropic that allows developers to send messages to Claude models "
     "and receive generated text responses. It requires an API key and is billed per token.",
     "It's a way to talk to Claude (the AI) from your own program. You send a message, "
     "Claude sends back a reply. Like texting, but for code."),

    ("What is a confusion matrix?",
     "A table showing predicted vs actual labels for a classifier. Rows are actual classes, "
     "columns are predicted classes. Diagonal cells are correct predictions; off-diagonal "
     "cells are misclassifications.",
     "A report card showing where the robot gets confused. If it often mixes up "
     "'billing_issue' and 'cancellation_request', those cells will be bright in the table."),
]

# Add each rapid-fire entry to the story: question, full answer, simplified
# answer, then a divider rule, with small spacers in between.
for question, answer_full, answer_simple in rapid_fire:
    story.extend((
        sp(4),
        q(question),
        sp(2),
        a(answer_full),
        sp(2),
        simple(answer_simple),
        sp(4),
        rule(),
    ))

story.append(PageBreak())

# ===========================================================================
# SECTION 10 — QUESTIONS TO ASK
# ===========================================================================
# Section banner followed by the category heading for the questions below.
story.extend(sec("Section 10: Questions YOU Should Ask the Interviewer"))
story.extend(cat(["Show Curiosity & Depth"]))

# Intro paragraph on why to ask questions, then a spacer.
story.extend([
    body(
        "Asking smart questions at the end of an interview shows genuine interest, "
        "seniority, and that you have thought beyond the code. Have at least 3-4 ready."
    ),
    sp(10),
])

# Questions for the candidate to ask, as (question, why it works) pairs.
# Each element is wrapped in its own parentheses so the boundary between the
# two implicitly concatenated strings is explicit.
questions_to_ask = [
    (
        (
            "How do you currently handle intent classification in your customer support pipeline, "
            "and what are the biggest pain points?"
        ),
        (
            "This shows you're thinking about real-world application and positioning your skills "
            "against actual problems they face. It also opens a dialogue about how your project "
            "experience is relevant."
        ),
    ),
    (
        (
            "What does your model evaluation and monitoring setup look like in production? "
            "How do you detect when a model starts degrading?"
        ),
        (
            "This shows you think about the full ML lifecycle — not just training, but "
            "post-deployment health. It's a question a senior ML engineer would ask."
        ),
    ),
    (
        (
            "How do you balance automation confidence with the cost of human escalation? "
            "Where do you draw the line between automated response and human review?"
        ),
        (
            "This ties directly to your project's confidence threshold design. "
            "It shows you understand the business trade-off, not just the technical one."
        ),
    ),
    (
        (
            "What is the main bottleneck in your current NLP/LLM pipeline — is it latency, "
            "accuracy, cost, or something else?"
        ),
        (
            "This is a strategic question that shows you understand constraints. "
            "The answer will tell you a lot about the team's priorities."
        ),
    ),
    (
        (
            "How do you manage prompt versioning when you update templates "
            "that are live in production?"
        ),
        (
            "This is a sharp, specific question about LLMOps. Most companies struggle with this "
            "and it shows you have thought about deployment realities beyond just building the model."
        ),
    ),
    (
        (
            "How does the team approach handling new intent categories "
            "that weren't in the original training set?"
        ),
        (
            "This shows you understand model limitations (out-of-distribution inputs) and are "
            "thinking about long-term maintenance."
        ),
    ),
]

# Both paragraph styles are identical for every question, so they are built
# once here rather than re-created on each loop iteration.
# NOTE(review): ``borderPad`` does not look like a property ReportLab's
# ParagraphStyle draws with (the documented name is ``borderPadding``), so the
# padding values below may currently have no effect — confirm before renaming,
# since enabling the padding would change the rendered layout.
QUESTION_TO_ASK_STYLE = ParagraphStyle(
    "QtoAsk", parent=styles["Normal"],
    fontSize=11, leading=16, textColor=colors.HexColor("#0f3460"),
    fontName="Helvetica-BoldOblique", leftIndent=10, spaceAfter=4,
    borderColor=colors.HexColor("#0f3460"), borderWidth=1,
    borderPad=8, backColor=colors.HexColor("#f0f4ff"), borderRadius=4
)
WHY_IT_WORKS_STYLE = ParagraphStyle(
    "WhyWorks", parent=styles["Normal"],
    fontSize=10, leading=14, textColor=colors.HexColor("#374151"),
    leftIndent=10, spaceAfter=6,
    backColor=colors.HexColor("#f9fafb"),
    borderColor=colors.HexColor("#d1d5db"), borderWidth=0.5,
    borderPad=6
)

# Add one numbered entry per question: heading, the quoted question, the
# rationale paragraph, then a divider rule. Numbering starts at 1.
for i, (q_text, why_text) in enumerate(questions_to_ask, 1):
    block = [
        sp(4),
        Paragraph(f"Question {i}:", CATEGORY_STYLE),
        Paragraph(f'"{q_text}"', QUESTION_TO_ASK_STYLE),
        sp(4),
        Paragraph(f"Why this works: {why_text}", WHY_IT_WORKS_STYLE),
        sp(4),
        rule()
    ]
    story += block

story.append(PageBreak())

# ===========================================================================
# QUICK REFERENCE CHEAT SHEET
# ===========================================================================
story += sec("Quick Reference — Key Numbers to Remember")

story.append(body(
    "Memorise these numbers. Quoting exact results confidently makes a strong impression."
))
story.append(sp(10))

# Row 0 is the header; every other row is [metric, value, explanation].
cheat_sheet_data = [
    ["Metric", "Value", "What It Means"],
    ["Baseline Weighted F1", "0.9958", "TF-IDF + Logistic Regression accuracy"],
    ["DistilBERT Weighted F1", "0.9825", "Fine-tuned transformer accuracy"],
    ["Min per-class F1 (Baseline)", "0.985", "Worst single class performance"],
    ["Min per-class F1 (DistilBERT)", "0.953", "Worst single class performance"],
    ["Answer Relevancy", "0.837 (PASS)", "LLM responses address customer questions"],
    ["Faithfulness", "0.667 (expected low)", "LLM generates beyond the template — intentional"],
    ["Confidence threshold", "0.70", "Below this, route to human review"],
    ["Training data size", "26,872 examples", "Full Bitext dataset"],
    ["CPU training subsample", "3,000 examples", "Adaptive for CPU-only training"],
    ["Training steps (CPU)", "300 steps", "~20 min on CPU"],
    ["Evaluation queries", "50 queries", "RAGAS-style evaluation sample"],
    ["Baseline model size", "0.4 MB", "TF-IDF + LR pickle"],
    ["DistilBERT model size", "4,088 MB", "Fine-tuned transformer weights"],
    ["Baseline inference", "0.15 ms/sample", "Extremely fast"],
    ["DistilBERT inference", "21.18 ms/sample", "140x slower but much more capable"],
    ["Intent categories", "6", "billing, account, technical, inquiry, cancellation, feedback"],
    ["Test set queries (generation)", "200 queries", "Subsampled for LLM generation pipeline"],
]

# Plain strings in a Table cell never wrap, and several explanations are wider
# than the 6 cm third column at 9 pt. Wrapping that column in Paragraphs lets
# long text flow onto extra lines instead of spilling past the cell border.
_cheat_note_style = ParagraphStyle(
    "CheatNote", parent=styles["Normal"],
    fontSize=9, leading=11
)
_cheat_rows = [cheat_sheet_data[0]] + [
    [metric, value, Paragraph(note, _cheat_note_style)]
    for metric, value, note in cheat_sheet_data[1:]
]

# Look the highlighted rows up by label rather than hard-coding indices, so
# the colours stay on the intended metrics if rows are added or reordered.
_cheat_row_index = {row[0]: idx for idx, row in enumerate(cheat_sheet_data)}
_relevancy_row = _cheat_row_index["Answer Relevancy"]
_faithfulness_row = _cheat_row_index["Faithfulness"]

cheat = Table(_cheat_rows, colWidths=[6*cm, 4.5*cm, 6*cm])
cheat.setStyle(TableStyle([
    ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")),
    ("TEXTCOLOR", (0, 0), (-1, 0), colors.white),
    ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"),
    ("FONTSIZE", (0, 0), (-1, -1), 9),
    ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f0f4ff"), colors.white]),
    ("ALIGN", (1, 0), (1, -1), "CENTER"),
    ("VALIGN", (0, 0), (-1, -1), "MIDDLE"),
    ("GRID", (0, 0), (-1, -1), 0.4, colors.HexColor("#dee2e6")),
    ("TOPPADDING", (0, 0), (-1, -1), 6),
    ("BOTTOMPADDING", (0, 0), (-1, -1), 6),
    ("LEFTPADDING", (0, 0), (-1, -1), 6),
    # Highlight the pass/fail rows: green for the passing Answer Relevancy
    # value, amber for the intentionally low Faithfulness value.
    ("TEXTCOLOR", (1, _relevancy_row), (1, _relevancy_row), colors.HexColor("#065f46")),
    ("TEXTCOLOR", (1, _faithfulness_row), (1, _faithfulness_row), colors.HexColor("#92400e")),
    ("FONTNAME", (1, _relevancy_row), (1, _relevancy_row), "Helvetica-Bold"),
    ("FONTNAME", (1, _faithfulness_row), (1, _faithfulness_row), "Helvetica-Bold"),
]))
story.append(cheat)
story.append(sp(16))

# Final encouragement
# Closing block: a heavy divider, a centred headline, and a centred
# paragraph of encouragement that ends the document.
closing_headline_style = ParagraphStyle(
    "Final", parent=styles["Normal"],
    fontSize=16, textColor=colors.HexColor("#0f3460"),
    fontName="Helvetica-Bold", alignment=TA_CENTER, spaceAfter=8
)
closing_body_style = ParagraphStyle(
    "FinalBody", parent=styles["Normal"],
    fontSize=11, leading=17, textColor=colors.HexColor("#374151"),
    alignment=TA_CENTER, spaceAfter=6
)
closing_message = (
    "Every number in that cheat sheet came from code you wrote. "
    "Every decision — from the confidence threshold to the custom evaluator — "
    "was yours. When an interviewer asks about this project, you are the expert "
    "in the room. Speak with confidence."
)

story.append(
    HRFlowable(width="100%", thickness=2, color=colors.HexColor("#0f3460"), spaceAfter=12)
)
story.append(Paragraph("You Built This. Own It.", closing_headline_style))
story.append(Paragraph(closing_message, closing_body_style))

# ---------------------------------------------------------------------------
# Build PDF
# ---------------------------------------------------------------------------
# Page geometry: A4 with 2 cm side margins and 2.5 cm top/bottom margins.
page_layout = {
    "pagesize": A4,
    "leftMargin": 2*cm,
    "rightMargin": 2*cm,
    "topMargin": 2.5*cm,
    "bottomMargin": 2.5*cm,
}
# Document metadata passed to the template as title and author.
pdf_metadata = {
    "title": "Interview Prep — Customer Support AI",
    "author": "Claude Code",
}

# Render the accumulated story to OUTPUT and report where the file landed.
doc = SimpleDocTemplate(str(OUTPUT), **page_layout, **pdf_metadata)
doc.build(story)
print(f"PDF saved -> {OUTPUT.resolve()}")