"""Generate a recruiter interview Q&A PDF for the intent classifier project. Covers every likely question a recruiter or technical interviewer would ask, with clear, simple answers explained as if to a 7-year-old — no jargon left unexplained. """ import sys from pathlib import Path sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from reportlab.lib.pagesizes import A4 from reportlab.lib.styles import getSampleStyleSheet, ParagraphStyle from reportlab.lib.units import cm from reportlab.lib import colors from reportlab.platypus import ( SimpleDocTemplate, Paragraph, Spacer, PageBreak, Table, TableStyle, HRFlowable, KeepTogether ) from reportlab.lib.enums import TA_LEFT, TA_CENTER, TA_JUSTIFY OUTPUT = Path("results/interview_prep.pdf") OUTPUT.parent.mkdir(exist_ok=True) # --------------------------------------------------------------------------- # Styles # --------------------------------------------------------------------------- styles = getSampleStyleSheet() TITLE_STYLE = ParagraphStyle( "ITitle", parent=styles["Title"], fontSize=30, textColor=colors.HexColor("#0f3460"), spaceAfter=12, alignment=TA_CENTER, fontName="Helvetica-Bold" ) SUBTITLE_STYLE = ParagraphStyle( "ISubtitle", parent=styles["Normal"], fontSize=13, textColor=colors.HexColor("#533483"), spaceAfter=8, alignment=TA_CENTER ) COVER_BODY = ParagraphStyle( "ICoverBody", parent=styles["Normal"], fontSize=11, leading=17, textColor=colors.HexColor("#1a1a2e"), alignment=TA_CENTER, spaceAfter=8 ) SECTION_STYLE = ParagraphStyle( "ISection", parent=styles["Heading1"], fontSize=18, textColor=colors.white, spaceBefore=16, spaceAfter=8, backColor=colors.HexColor("#0f3460"), borderPad=8, fontName="Helvetica-Bold" ) CATEGORY_STYLE = ParagraphStyle( "ICategory", parent=styles["Heading2"], fontSize=13, textColor=colors.HexColor("#533483"), spaceBefore=14, spaceAfter=4, fontName="Helvetica-Bold" ) Q_STYLE = ParagraphStyle( "IQuestion", parent=styles["Normal"], fontSize=11, leading=16, textColor=colors.HexColor("#0f3460"), spaceBefore=10, spaceAfter=3, fontName="Helvetica-Bold", backColor=colors.HexColor("#e8f4fd"), borderColor=colors.HexColor("#0f3460"), borderWidth=1, borderPad=7, borderRadius=4, leftIndent=0 ) A_STYLE = ParagraphStyle( "IAnswer", parent=styles["Normal"], fontSize=10, leading=16, textColor=colors.HexColor("#1a1a1a"), spaceBefore=4, spaceAfter=4, alignment=TA_JUSTIFY, leftIndent=8 ) SIMPLE_STYLE = ParagraphStyle( "ISimple", parent=styles["Normal"], fontSize=10, leading=15, textColor=colors.HexColor("#065f46"), spaceBefore=4, spaceAfter=6, backColor=colors.HexColor("#ecfdf5"), borderColor=colors.HexColor("#6ee7b7"), borderWidth=1, borderPad=6, borderRadius=3, leftIndent=8 ) TIP_STYLE = ParagraphStyle( "ITip", parent=styles["Normal"], fontSize=9.5, leading=14, textColor=colors.HexColor("#92400e"), spaceBefore=3, spaceAfter=6, backColor=colors.HexColor("#fffbeb"), borderColor=colors.HexColor("#fcd34d"), borderWidth=1, borderPad=5, leftIndent=8 ) BULLET_STYLE = ParagraphStyle( "IBullet", parent=styles["Normal"], fontSize=10, leading=15, textColor=colors.HexColor("#1a1a1a"), leftIndent=20, spaceAfter=3, bulletIndent=10 ) BODY_STYLE = ParagraphStyle( "IBody", parent=styles["Normal"], fontSize=10, leading=15, textColor=colors.HexColor("#374151"), spaceAfter=5, alignment=TA_JUSTIFY ) # --------------------------------------------------------------------------- # Helpers # --------------------------------------------------------------------------- def sec(title): return [Spacer(1, 10), Paragraph(f" {title}", SECTION_STYLE), Spacer(1, 6)] def cat(title): if isinstance(title, list): title = title[0] return [Paragraph(title, CATEGORY_STYLE), HRFlowable(width="100%", thickness=0.8, color=colors.HexColor("#533483"), spaceAfter=4)] def q(text): return Paragraph(f"Q: {text}", Q_STYLE) def a(text): return Paragraph(text, A_STYLE) def simple(text): return Paragraph(f" Simple version: {text}", SIMPLE_STYLE) def tip(text): return Paragraph(f" Interview Tip: {text}", TIP_STYLE) def bul(text): return Paragraph(f" - {text}", BULLET_STYLE) def body(text): return Paragraph(text, BODY_STYLE) def sp(n=8): return Spacer(1, n) def rule(): return HRFlowable(width="100%", thickness=0.4, color=colors.HexColor("#e5e7eb"), spaceAfter=6) def qa_block(question, answer_text, simple_text="", tip_text="", bullets=None): """One complete Q&A block with optional simple version, tip, and bullets.""" items = [sp(4), q(question), sp(3), a(answer_text)] if bullets: for b in bullets: items.append(bul(b)) if simple_text: items.append(sp(2)) items.append(simple(simple_text)) if tip_text: items.append(sp(2)) items.append(tip(tip_text)) items.append(sp(4)) items.append(rule()) return items # --------------------------------------------------------------------------- # Build story # --------------------------------------------------------------------------- story = [] # ===== COVER PAGE ===== story += [ sp(50), Paragraph("Interview Prep Guide", TITLE_STYLE), Paragraph("Customer Support AI — Intent Classifier Project", SUBTITLE_STYLE), sp(16), Paragraph( "This guide prepares you to answer any question a recruiter or technical interviewer " "might ask about your Customer Support AI project.", COVER_BODY ), sp(8), Paragraph( "Every answer is written twice: once in proper technical language, and once in " "super-simple language — the way you would explain it to a 7-year-old. " "Reading both will make the concept stick.", COVER_BODY ), sp(20), ] # Summary box cover_table = Table( [[ Paragraph("30\nQuestions\nCovered", ParagraphStyle("ct", fontSize=13, alignment=TA_CENTER, textColor=colors.white, fontName="Helvetica-Bold", leading=18)), Paragraph("5\nDifficulty\nLevels", ParagraphStyle("ct2", fontSize=13, alignment=TA_CENTER, textColor=colors.white, fontName="Helvetica-Bold", leading=18)), Paragraph("Simple\nExplanation\nEvery Time", ParagraphStyle("ct3", fontSize=13, alignment=TA_CENTER, textColor=colors.white, fontName="Helvetica-Bold", leading=18)), ]], colWidths=[5*cm, 5*cm, 5*cm] ) cover_table.setStyle(TableStyle([ ("BACKGROUND", (0, 0), (0, 0), colors.HexColor("#0f3460")), ("BACKGROUND", (1, 0), (1, 0), colors.HexColor("#533483")), ("BACKGROUND", (2, 0), (2, 0), colors.HexColor("#2d6a4f")), ("ALIGN", (0, 0), (-1, -1), "CENTER"), ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), ("ROWBACKGROUNDS", (0, 0), (-1, -1), [None]), ("BOX", (0, 0), (-1, -1), 1, colors.white), ("INNERGRID", (0, 0), (-1, -1), 1, colors.white), ("TOPPADDING", (0, 0), (-1, -1), 14), ("BOTTOMPADDING", (0, 0), (-1, -1), 14), ])) story.append(cover_table) story.append(PageBreak()) # ===== TABLE OF CONTENTS ===== story += sec("Table of Contents") toc_data = [ ["Section", "Topic", "Page"], ["1", "The Big Picture — What Did You Build?", "3"], ["2", "The Data — Where Did It Come From?", "5"], ["3", "The Models — How Did You Train Them?", "7"], ["4", "The Pipeline — How Does It All Connect?", "11"], ["5", "Evaluation — How Do You Know It Works?", "13"], ["6", "Challenges & Problem Solving", "16"], ["7", "Production & Real-World Thinking", "18"], ["8", "Behavioural Questions", "21"], ["9", "Rapid-Fire Questions (Short Answers)", "23"], ["10", "Questions YOU Should Ask the Interviewer", "25"], ] toc = Table(toc_data, colWidths=[1.5*cm, 12*cm, 2.5*cm]) toc.setStyle(TableStyle([ ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")), ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), ("FONTSIZE", (0, 0), (-1, 0), 10), ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f8f9fa"), colors.white]), ("FONTSIZE", (0, 1), (-1, -1), 10), ("ALIGN", (0, 0), (0, -1), "CENTER"), ("ALIGN", (2, 0), (2, -1), "CENTER"), ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), ("GRID", (0, 0), (-1, -1), 0.5, colors.HexColor("#dee2e6")), ("TOPPADDING", (0, 0), (-1, -1), 7), ("BOTTOMPADDING", (0, 0), (-1, -1), 7), ("LEFTPADDING", (0, 0), (-1, -1), 8), ])) story += [toc, PageBreak()] # =========================================================================== # SECTION 1 — THE BIG PICTURE # =========================================================================== story += sec("Section 1: The Big Picture — What Did You Build?") story += cat(["Overview Questions"]) story += qa_block( question="Can you give me a 60-second summary of this project?", answer_text=( "I built a two-stage automated customer support system. In stage one, a fine-tuned " "DistilBERT model reads an incoming customer message and classifies it into one of six " "intent categories — things like billing issues, account access problems, or cancellation " "requests. In stage two, the predicted intent is passed as context to Claude (an Anthropic " "LLM), which then generates a helpful, human-sounding support response tailored to that " "specific intent. The system also flags low-confidence predictions for human review. I " "evaluated the full pipeline using a custom LLM-based scoring framework for faithfulness " "and answer relevancy, achieving 0.837 answer relevancy on 50 test queries." ), simple_text=( "Imagine a robot postbox at a company. When a customer sends a message, the robot reads " "it and puts it in one of six boxes — like 'money problems' or 'can't log in'. Then a " "second, smarter robot writes a kind reply based on which box it went into. I built both " "robots and tested how well they work." ), tip_text="Always open with: what it does, how it works, and one key result. This answer does all three." ) story += qa_block( question="Why did you choose this project?", answer_text=( "Customer support automation is a genuine industry problem — companies spend billions " "on support operations and response quality is inconsistent. This project let me practice " "the full ML lifecycle in one place: data engineering, fine-tuning a transformer model, " "prompt engineering with a production LLM, evaluation framework design, and packaging " "everything into a reproducible pipeline. It also demonstrates that I understand both " "classical NLP (TF-IDF baseline) and modern deep learning approaches." ), simple_text=( "Customer support is expensive and slow. I wanted to build something that actually " "saves a company time and money. And it let me practice every important skill in one " "single project — like training for a sports competition by doing every exercise at once." ), tip_text="Show that you understood the business problem, not just the tech. Recruiters love this." ) story += qa_block( question="What are the two stages of the pipeline?", answer_text=( "Stage 1 is the Intent Classifier: a DistilBERT transformer model fine-tuned on labelled " "customer support examples. It reads the raw customer query and outputs a predicted intent " "label plus a confidence score. Stage 2 is the Response Generator: an Anthropic Claude " "model that receives the original query plus a structured prompt template filled with " "intent-specific guidance, and produces a personalised support response. The two stages " "are chained in the SupportAgent class." ), simple_text=( "Stage 1 is the SORTING robot — it reads the message and decides what kind of problem " "it is. Stage 2 is the WRITING robot — it reads the sorted message and writes a nice " "reply. They work together like a post office and a letter writer." ), tip_text="Draw this on a whiteboard if you get the chance. Diagrams make answers memorable." ) story += qa_block( question="What are the 6 intent categories and how did you choose them?", answer_text=( "The six categories are: billing_issue (charges, refunds, payment problems), " "account_access (login, password, account management), technical_support (product " "or service problems, delivery), product_inquiry (information, compatibility, " "warranty), cancellation_request (cancelling orders or subscriptions), and " "general_feedback (complaints, suggestions, general questions). I derived these " "by analysing the Bitext customer support dataset's 50+ granular intent tags and " "grouping them into business-meaningful categories that a real support department " "would use to route tickets." ), simple_text=( "Think of it like sorting your toys into boxes: money box, login box, broken-thing box, " "asking-questions box, I-want-to-quit box, and other box. These six boxes cover almost " "everything a customer could ever message about." ), tip_text="Mention that the categories were business-driven, not just technically convenient. This shows maturity." ) story.append(PageBreak()) # =========================================================================== # SECTION 2 — THE DATA # =========================================================================== story += sec("Section 2: The Data — Where Did It Come From?") story += cat(["Dataset Questions"]) story += qa_block( question="What dataset did you use and why?", answer_text=( "I used the Bitext Customer Support LLM Chatbot Training Dataset from HuggingFace, " "which contains 26,872 labelled customer support utterances across 50+ fine-grained " "intent categories. I chose it because it is publicly available, professionally " "labelled, representative of real support language, and large enough to fine-tune " "a transformer model reliably. It also covers a wide vocabulary of customer phrasings " "for the same intent, which helps the model generalise." ), simple_text=( "I found a big collection of 26,872 real customer messages on the internet. Each " "message already had a label saying what the customer wanted. It's like having a " "giant homework sheet that already has all the answers marked — perfect for teaching " "the robot." ), tip_text="Always know your dataset size, source, and why it was appropriate. These are standard first questions." ) story += qa_block( question="How did you preprocess the data?", answer_text=( "Preprocessing involved three steps: (1) Text cleaning — converting text to lowercase, " "stripping non-ASCII characters, and normalising whitespace using regex. This reduces " "vocabulary noise without removing meaningful content. (2) Label mapping — the Bitext " "dataset has 50+ granular tags which I mapped to my 6 business categories using a " "keyword-based dictionary (LABEL_MAP). Labels that didn't match a keyword got assigned " "via a fallback heuristic. (3) Stratified splitting — I split the data 70/15/15 into " "train/validation/test sets using sklearn's train_test_split with stratify=label, " "ensuring all 6 classes are proportionally represented in every split." ), simple_text=( "I cleaned the messages (made everything lowercase, removed weird characters), " "then sorted the 50+ original label types into my 6 big categories, " "and finally split the data into three piles: a teaching pile, a practice pile, " "and a final exam pile." ), tip_text="Stratified splitting is an important detail that shows you understand class imbalance. Mention it confidently." ) story += qa_block( question="What is stratified splitting and why does it matter?", answer_text=( "Stratified splitting means that when you divide your data into train, validation, " "and test sets, you ensure each set contains the same proportion of each class label " "as the original dataset. Without this, you might accidentally put all examples of a " "rare class into the training set and have none in the test set, making evaluation " "meaningless. sklearn's train_test_split with stratify=y handles this automatically." ), simple_text=( "Imagine you have 10 red balls and 90 blue balls. Stratified splitting means that " "no matter which pile you make, each pile has roughly 10% red and 90% blue. " "If you did it randomly, you might get a pile that's 100% blue and never test " "if the robot can recognise red ones." ), tip_text="This is a classic interview topic. Knowing why it matters (not just what it is) impresses interviewers." ) story += qa_block( question="You mapped 50+ labels to 6. How did you handle ambiguous labels?", answer_text=( "I built a LABEL_MAP dictionary that maps each of the Bitext tags to one of my 6 " "categories using exact string matching. For any tag that wasn't explicitly in the " "dictionary, I applied a keyword fallback: if the tag string contained words like " "'bill', 'charge', or 'payment', it was assigned to billing_issue, and so on for each " "category. This covered the vast majority of cases. About 973 rows used the fallback. " "In a production system, I would review these fallback assignments manually to ensure " "accuracy." ), simple_text=( "I made a lookup table — like a translation dictionary. If a label was in the " "dictionary, I used that translation. If not, I tried to guess from the words in " "the label name. Like if a label said 'billing_adjustment', I could guess it belongs " "in the money/billing box because it contains the word 'billing'." ), tip_text="Acknowledging the 973 fallback rows and saying you'd manually review them shows intellectual honesty." ) story.append(PageBreak()) # =========================================================================== # SECTION 3 — THE MODELS # =========================================================================== story += sec("Section 3: The Models — How Did You Train Them?") story += cat(["Baseline Model Questions"]) story += qa_block( question="You built two classifiers. What is the baseline and why did you build it?", answer_text=( "The baseline is a TF-IDF vectoriser combined with a Logistic Regression classifier, " "implemented as a single sklearn Pipeline. TF-IDF converts each message into a vector " "of numbers representing word importance scores. Logistic Regression then finds the " "linear decision boundary that separates the 6 classes. I built it first because: " "(1) it trains in milliseconds, (2) it provides a performance floor to compare against, " "and (3) it demonstrates that I understand when simpler models are appropriate." ), simple_text=( "Before building the fancy robot, I built a simple one. The simple one counts which " "words appear in a message and uses that to guess the category. It's like a calculator " "vs a smartphone. I built the calculator first to prove the smartphone was actually " "worth building." ), tip_text="Always justify your baseline. Interviewers want to see that you built it deliberately, not as an afterthought." ) story += qa_block( question="What is TF-IDF?", answer_text=( "TF-IDF stands for Term Frequency — Inverse Document Frequency. TF measures how often " "a word appears in one document (high TF = word is frequent in this doc). IDF measures " "how rare the word is across all documents (high IDF = word is unique to few docs). " "Multiplying them gives a score that is high for words that are common in one document " "but rare across the whole dataset — these are the most informative words. Common words " "like 'the' or 'is' get near-zero scores because they appear everywhere." ), simple_text=( "Imagine every word gets a score. A word that appears a lot in just ONE message " "gets a high score — it's special to that message. A word like 'the' that appears " "in every single message gets a low score — it tells us nothing. TF-IDF is just a " "formula for giving each word its specialness score." ), tip_text="TF-IDF is a very common interview question. Learn this definition by heart." ) story += qa_block( question="Your baseline (0.9958 F1) outperformed DistilBERT (0.9825 F1). How do you explain that?", answer_text=( "There are two reasons. First, the dataset itself: the Bitext dataset is professionally " "labelled and uses very consistent, formal language for each intent. TF-IDF word counts " "are perfectly sufficient to separate these clean categories — specific keywords almost " "uniquely identify each class. Second, the training constraint: I was running on CPU " "only, so I subsampled to 3,000 training examples and capped training at 300 steps. " "DistilBERT trained on the full dataset with more epochs would likely match or exceed " "the baseline. The baseline advantage is a dataset characteristic, not evidence that " "DistilBERT is a worse model." ), simple_text=( "The fancy robot did slightly worse because I couldn't let it study for long enough — " "it only had 300 practice rounds instead of thousands. The simple robot was good enough " "for this particular test because the messages in the dataset use very predictable words. " "If we had messier, real-world messages, the fancy robot would win." ), tip_text=( "This is almost guaranteed to come up. Interviewers love testing whether you understand " "your own results. The two-part answer (dataset quality + training constraint) is impressive." ) ) story += cat(["DistilBERT & Fine-Tuning Questions"]) story += qa_block( question="What is DistilBERT and why did you choose it?", answer_text=( "DistilBERT is a smaller, faster version of BERT (Bidirectional Encoder Representations " "from Transformers) created by HuggingFace using a technique called knowledge distillation. " "It retains 97% of BERT's language understanding while being 40% smaller and 60% faster. " "I chose it over full BERT because: (1) I was training on CPU, so speed and memory matter, " "(2) 97% performance retention is sufficient for a classification task, and (3) it is " "a production-proven model with excellent HuggingFace support." ), simple_text=( "BERT is a very smart robot brain that has read millions of books and websites. " "DistilBERT is BERT's younger sibling — 40% smaller, almost just as smart. I picked " "the little sibling because it runs faster on my computer, and for sorting six categories " "of messages, the little sibling is smart enough." ), tip_text="Justify model choice with concrete numbers (40% smaller, 97% performance, 60% faster). Don't just say 'it's popular'." ) story += qa_block( question="What is fine-tuning and what does it mean to fine-tune DistilBERT?", answer_text=( "Fine-tuning means taking a pre-trained model — one that has already learned general " "language understanding from a massive text corpus — and continuing to train it on a " "smaller, task-specific dataset. The pre-trained model already knows grammar, context, " "and word meanings. Fine-tuning teaches it the specifics of your task. For DistilBERT, " "this means: (1) loading the pre-trained weights, (2) adding a classification head " "(a new linear layer that outputs 6 class probabilities), and (3) training the entire " "model end-to-end on the labelled customer support data." ), simple_text=( "Imagine you hire someone who already speaks fluent English and has read every book " "ever written. Fine-tuning is like giving that person a one-week crash course on " "customer support specifically. They already know words and sentences — you just " "teach them your specific job. Much faster than training someone from scratch." ), tip_text="Use the 'pre-trained + task-specific' framing. It's the standard mental model for fine-tuning." ) story += qa_block( question="What is a classification head?", answer_text=( "A classification head is a simple linear layer added on top of a pre-trained model. " "DistilBERT's core outputs a 768-dimensional vector (called the [CLS] token embedding) " "that represents the meaning of the entire input sentence. The classification head " "multiplies this 768-dimensional vector by a weight matrix to produce 6 output " "scores (one per class), then applies softmax to convert them into probabilities. " "During fine-tuning, both the DistilBERT weights and the classification head weights " "are updated." ), simple_text=( "DistilBERT reads a sentence and produces a big list of 768 numbers that summarises " "the meaning. The classification head is like a voting machine — it takes those 768 " "numbers, does some maths, and outputs 6 scores: 'billing: 80%, login: 5%, ...' " "The highest score wins and becomes the prediction." ), tip_text="Knowing the dimension (768) and that softmax converts logits to probabilities is a strong technical detail." ) story += qa_block( question="What hyperparameters did you tune and why?", answer_text=( "Key hyperparameters: learning_rate=2e-5 (standard for BERT fine-tuning; too high " "causes catastrophic forgetting, too low means no learning), max_length=128 tokens " "(sufficient for short support queries, reduces memory), batch_size=16 (balance " "between gradient quality and memory on CPU), max_steps=300 (CPU-adaptive cap to " "complete training in reasonable time), warmup_steps=int(0.1 * max_steps) (prevents " "large gradient updates in early training when weights are random). These are " "standard recommendations from the original BERT paper, adapted for CPU constraints." ), simple_text=( "Hyperparameters are like the settings on an oven before you bake a cake. " "Learning rate is how fast the robot adjusts — too fast and it forgets everything, " "too slow and it never learns. Batch size is how many examples it looks at " "before updating. Warmup steps is a gentle warm-up period, like stretching " "before exercise." ), tip_text=( "Always be able to explain WHY you set each hyperparameter, not just what you set it to. " "'2e-5 is standard for BERT fine-tuning per the original paper' is a strong answer." ) ) story += qa_block( question="How did you handle training on CPU only?", answer_text=( "I implemented automatic hardware detection at the start of training using " "torch.cuda.is_available(). When no GPU is detected, the training script activates " "two adaptive strategies: (1) Data subsampling — it stratified-samples 3,000 examples " "from the full training set rather than training on all 18,000, ensuring all 6 classes " "remain represented; (2) Step capping — it sets max_steps=300 instead of training for " "multiple full epochs. This reduces training time from ~20 hours to ~20 minutes while " "still producing a functional model." ), simple_text=( "Training a big neural network without a GPU is like running a marathon on crutches — " "very slow. So I wrote code that detects 'no GPU found' and automatically switches " "to a faster, smaller version of the training: fewer examples, fewer steps. " "The robot doesn't learn as much, but it learns enough, and it finishes in 20 minutes " "instead of 20 hours." ), tip_text="This shows engineering pragmatism — you adapted to constraints rather than just failing. Interviewers love this." ) story.append(PageBreak()) # =========================================================================== # SECTION 4 — THE PIPELINE # =========================================================================== story += sec("Section 4: The Pipeline — How Does It All Connect?") story += cat(["Architecture Questions"]) story += qa_block( question="Walk me through what happens when a customer sends a message.", answer_text=( "1. The raw customer query arrives at SupportAgent.resolve(). " "2. IntentClassifier.predict() tokenises the text, runs it through DistilBERT, " "and returns the top predicted intent label plus a confidence score (softmax probability). " "3. If confidence is below 0.70, the agent sets requires_human=True and returns a " "flag for human review without calling the LLM. " "4. Otherwise, get_template() fetches the intent-specific prompt template. " "format_user_prompt() fills in the customer query. " "5. ResponseGenerator.generate() sends the system prompt and user prompt to " "Claude via the Anthropic API and receives the generated response. " "6. The agent returns a dict containing the query, intent, confidence, response, " "context, and human_review flag." ), simple_text=( "Step 1: Customer writes a message. Step 2: Robot 1 reads it and decides which " "of 6 boxes it belongs to (and how sure it is). Step 3: If the robot is not sure " "enough (less than 70% confident), it raises a flag and a real human will handle it. " "Step 4: If the robot is sure, it picks the right letter template for that topic. " "Step 5: Robot 2 (Claude) reads the template and writes a personalised reply. " "Step 6: The full reply plus all the details are returned." ), tip_text="Practice saying this as a numbered list out loud. Being able to narrate a system end-to-end is a strong interview skill." ) story += qa_block( question="What is prompt engineering and how did you use it?", answer_text=( "Prompt engineering is the practice of crafting input text to an LLM to guide it " "toward producing a desired output. In this project, I designed 6 intent-specific " "prompt templates, each with a system prompt (setting the LLM's role and tone) and " "a user prompt (providing the customer query plus intent-specific guidance). " "For example, the billing_issue template instructs the model to acknowledge the " "financial concern, show empathy, and offer concrete next steps. This structured " "approach ensures consistent, on-brand responses without requiring the LLM to guess " "the appropriate tone and content." ), simple_text=( "Prompt engineering is writing good instructions for the robot. Instead of just " "saying 'write a reply', I say 'you are a friendly support agent, the customer has " "a billing problem, be empathetic, offer to help fix it'. The better your instructions, " "the better the robot's answer." ), tip_text="Mention that you have 6 separate templates, not one generic one. This shows attention to detail." ) story += qa_block( question="Why does the system flag low-confidence predictions for human review?", answer_text=( "The confidence threshold (0.70) acts as a safety net. When the classifier's softmax " "probability for the top class is below 70%, it indicates the model is uncertain — " "the input may be ambiguous, out-of-distribution, or phrased in a way the model " "hasn't seen. Sending an uncertain intent to the LLM would generate a response built " "on a potentially wrong context, which could mislead or frustrate the customer. " "Flagging for human review prevents poor automated responses from reaching customers " "while still automating the confident majority." ), simple_text=( "Imagine asking the sorting robot 'are you sure?' — if it's less than 70% sure, " "it says 'I'm not confident, a human should handle this one'. This is important " "because if the robot sorts the message into the wrong box, the reply will be " "totally wrong. Better to get a human than to send a bad automated reply." ), tip_text="This shows you designed for real-world use, not just accuracy metrics. Production-readiness thinking." ) story.append(PageBreak()) # =========================================================================== # SECTION 5 — EVALUATION # =========================================================================== story += sec("Section 5: Evaluation — How Do You Know It Works?") story += cat(["Metrics & Evaluation Questions"]) story += qa_block( question="What is weighted F1 score and why did you use it?", answer_text=( "F1 score is the harmonic mean of precision and recall. Precision asks: of all the " "messages I labelled as 'billing_issue', how many actually were? Recall asks: of all " "the actual billing_issue messages, how many did I catch? The harmonic mean penalises " "imbalanced precision/recall more than the arithmetic mean. Weighted F1 averages the " "per-class F1 scores, weighting each class by its number of examples. I chose weighted " "F1 over accuracy because it better handles class imbalance — accuracy alone can be " "misleadingly high if one class dominates." ), simple_text=( "Imagine a test where 90% of questions are easy and 10% are hard. If you only answer " "the easy ones, you score 90% but you're failing on the hard ones. F1 score checks " "BOTH whether your answers are correct AND whether you answered all the questions — " "not just the easy majority." ), tip_text="Knowing why you chose F1 over accuracy is a very common interview question. Always have this answer ready." ) story += qa_block( question="What is RAGAS and how did you use it?", answer_text=( "RAGAS (Retrieval-Augmented Generation Assessment) is an open-source evaluation " "framework originally designed to measure the quality of RAG pipeline outputs. " "It provides metrics including Faithfulness (does the response stay within the " "provided context?) and Answer Relevancy (does the response address the question?). " "I initially attempted to use the RAGAS library but encountered dependency conflicts " "— it required OpenAI embeddings by default. I ultimately implemented the same metrics " "directly using Claude Haiku as the evaluator LLM, bypassing the library while " "preserving the conceptual framework." ), simple_text=( "RAGAS is a tool for grading AI replies. Faithfulness asks: did the robot stick to " "what it was told, or did it make things up? Answer Relevancy asks: did the robot " "actually answer the question? I tried using the RAGAS tool but it had technical " "problems, so I built my own version that does the same grading." ), tip_text="Be upfront about the dependency issue and your workaround. Showing problem-solving is better than hiding struggles." ) story += qa_block( question="Your faithfulness score was 0.667, below the 0.85 target. Is that a failure?", answer_text=( "Not in this context. Faithfulness in RAGAS measures whether the generated response " "is grounded in the provided context document. In a RAG system with a knowledge base, " "a low faithfulness score means the model hallucinated facts. But in this system, " "the 'context' is a prompt template with minimal content — it contains guidance and " "tone instructions, not a database of facts. Claude is expected to generate helpful " "domain knowledge (like explaining billing processes) that is not literally in the " "template. This is correct, desirable behaviour. The more meaningful metric here is " "Answer Relevancy (0.837), which passed its target of 0.80." ), simple_text=( "Faithfulness is like asking 'did the robot only use words from the instruction card?' " "But our instruction card only has general guidelines, not specific facts. So when " "the robot adds helpful details (like how to reset a password), it 'fails' faithfulness " "even though its answer was actually great. The more important score — did it answer " "the right question? — passed with 0.837." ), tip_text=( "This is the most nuanced result in the project. Interviewers who see the 0.667 will " "test you on it. Have this explanation ready and be confident — you are NOT making excuses, " "you are correctly identifying a metric limitation." ) ) story += qa_block( question="How did you evaluate the LLM-generated responses?", answer_text=( "I implemented a custom synchronous evaluator using Claude Haiku as the judge LLM. " "For each of the 50 test responses, I sent two evaluation prompts to Claude Haiku: " "one asking it to score faithfulness (0.0-1.0) and one asking it to score answer " "relevancy (0.0-1.0). Each prompt asked for only a single decimal number in the reply " "(max_tokens=10, temperature=0 for determinism). I then computed mean, median, std, " "min, and max across all 50 scores. Results were saved to results/ragas_scores.json." ), simple_text=( "I used a second AI (Claude Haiku) to grade the first AI's answers. For each answer, " "I asked Haiku two questions: 'How well does this answer stick to the topic? Score " "0 to 1' and 'How well does this answer address what the customer asked? Score 0 to 1'. " "Then I averaged all 50 scores to get the final grade." ), tip_text="LLM-as-judge evaluation is a hot topic in 2024-2026. Knowing why you use temperature=0 for evaluation (reproducibility) is a great detail." ) story += qa_block( question="What is the difference between precision and recall?", answer_text=( "Precision: of everything the model labelled as class X, what fraction actually is X? " "High precision = few false positives. Recall: of everything that actually is class X, " "what fraction did the model correctly identify? High recall = few false negatives. " "There is usually a trade-off: tuning for higher recall means accepting more false " "positives, and vice versa. The right balance depends on the cost of each error type. " "In a medical diagnosis context, high recall (catch all real cases) matters more. " "In a spam filter, high precision (don't block real emails) matters more." ), simple_text=( "Precision: if the robot says 'this is a cat', how often is it actually a cat? " "Recall: of all the real cats, how many did the robot notice? " "A robot that calls everything a cat has perfect recall (it never misses a cat) " "but terrible precision (most of what it calls cats are dogs). " "You need both to be good." ), tip_text="The medical/spam example is a classic way to make precision/recall trade-offs concrete. Use it." ) story.append(PageBreak()) # =========================================================================== # SECTION 6 — CHALLENGES # =========================================================================== story += sec("Section 6: Challenges & Problem Solving") story += cat(["'Tell Me About a Challenge' Questions"]) story += qa_block( question="What was the hardest technical problem you faced and how did you solve it?", answer_text=( "The most significant challenge was the RAGAS evaluation framework's hard dependency " "on OpenAI. After installing RAGAS and configuring the Anthropic LLM wrapper, the " "library still tried to call OpenAI for embedding-based metrics. Attempts to swap " "in HuggingFace embeddings via LangchainEmbeddingsWrapper also failed due to RAGAS's " "internal async timeout handling. Rather than spending hours debugging a third-party " "library, I made the decision to implement the same conceptual metrics — faithfulness " "and answer relevancy — as a direct, synchronous Anthropic API loop. This removed " "the dependency entirely, eliminated the async timeout issue, and produced cleaner, " "more interpretable results." ), simple_text=( "I tried to use a ready-made grading tool (RAGAS) but it secretly required a " "different AI service (OpenAI) that I wasn't using. No matter what I tried, " "it kept asking for that service. So instead of fighting it, I built my own " "grading tool from scratch in 100 lines of code. My version was actually simpler " "and worked better." ), tip_text="This answer shows debugging skill, good judgment (knowing when to stop debugging), and resourcefulness. Lead with the challenge, end with the solution." ) story += qa_block( question="How did you deal with the slow CPU training problem?", answer_text=( "The naive training run would have taken 20+ hours on CPU — clearly impractical. " "I solved it with two changes: (1) Automatic detection — the code checks " "torch.cuda.is_available() and activates 'CPU mode' when no GPU is found. " "(2) Adaptive parameters — in CPU mode, training data is stratified-subsampled " "to 3,000 examples and max_steps is capped at 300. This reduces training time to " "~20 minutes while still producing a model with 0.9825 F1, which proves the approach " "is sound. The config file exposes cpu_train_sample and cpu_max_steps as tunable " "parameters so they can be adjusted." ), simple_text=( "Training the robot normally would take 20 hours without a special graphics card. " "I wrote code that detects the slow computer and automatically switches to a " "faster mini-training mode: less data, fewer rounds. The robot doesn't become " "as expert, but it still gets a 98.25% score, which proves the idea works. " "It's like practicing for a marathon by running 5km — you prove you can run, " "even if you haven't run the full 42km yet." ), tip_text="Framing this as intentional engineering (not a workaround) is important. You made a pragmatic trade-off, not a mistake." ) story += qa_block( question="sklearn 1.8 removed the multi_class parameter. How did you handle a breaking change?", answer_text=( "When I ran the baseline training script, it threw a TypeError: " "LogisticRegression.__init__() got an unexpected keyword argument 'multi_class'. " "This is because sklearn 1.8 removed the deprecated multi_class='multinomial' " "parameter. The fix was simple — remove the parameter from both the code and config. " "Modern sklearn's LogisticRegression automatically handles multiclass problems using " "the one-vs-rest scheme by default, which produces equivalent results. This was a " "lesson in keeping requirements pinned in production to prevent unexpected breakage." ), simple_text=( "A tool I was using (sklearn) got an update that removed a setting I was using. " "The computer gave me an error saying it didn't recognise that setting anymore. " "I looked it up and found out the new version doesn't need that setting — it " "figures it out automatically. So I deleted that line of code and everything worked. " "Lesson learned: always write down exactly which version of each tool you're using." ), tip_text="Handling a library breaking change gracefully and learning from it is a great story for a behavioural question." ) story.append(PageBreak()) # =========================================================================== # SECTION 7 — PRODUCTION THINKING # =========================================================================== story += sec("Section 7: Production & Real-World Thinking") story += cat(["Scalability & Production Questions"]) story += qa_block( question="How would you deploy this system in production?", answer_text=( "A production deployment would involve: (1) Serving the classifier as a REST API " "using FastAPI, with the model loaded into memory at startup and a /predict endpoint. " "(2) Containerising with Docker so the model and all dependencies are portable. " "(3) Deploying to a cloud provider (AWS, GCP, or Azure) with auto-scaling based on " "request volume. (4) Implementing a message queue (e.g. SQS or Kafka) if volume is " "high, so requests are processed asynchronously. (5) Caching the LLM response for " "duplicate or near-duplicate queries to reduce Anthropic API costs. " "(6) Adding monitoring/logging (latency, error rate, intent distribution) with tools " "like Prometheus/Grafana or Datadog." ), simple_text=( "To put this in a real company, I would: wrap it in a web address so other apps " "can call it, package it in a box (Docker) so it runs anywhere, put it on a cloud " "computer that can grow bigger when more people use it, save common replies so we " "don't call the expensive AI every time, and add a dashboard showing how well it's " "working every day." ), tip_text="Even if you haven't deployed it, showing you KNOW how to deploy it is enough. Mention FastAPI, Docker, and monitoring." ) story += qa_block( question="How would you monitor this system once deployed?", answer_text=( "Monitoring would cover three layers: (1) Infrastructure metrics — latency, error rate, " "throughput (standard APM). (2) ML metrics — intent distribution drift (if billing_issue " "suddenly spikes, something changed), average confidence score over time (confidence drop " "may indicate the model is seeing new types of queries it wasn't trained on), and " "human_review escalation rate. (3) Business metrics — customer satisfaction, resolution " "time, re-contact rate. I would also implement periodic re-evaluation: run new queries " "through the LLM judge and alert if relevancy drops below threshold." ), simple_text=( "Monitoring is like a health check for the robot. I'd watch: is it fast enough? " "Is it confident? Are more messages than usual going to humans for review? " "Are customers satisfied with the replies? If any of these go wrong, " "it might mean the robot needs to be retrained or fixed." ), tip_text="Mentioning concept drift (confidence drops, distribution shifts) shows senior ML engineering knowledge." ) story += qa_block( question="How would you improve the model if given more resources?", answer_text=( "With a GPU: train on the full 18,000+ example dataset for 3-5 epochs with proper " "hyperparameter search (learning rate, batch size). " "With more data: collect real customer support tickets, which are messier than the " "Bitext dataset and would better reflect production distribution. " "Architecturally: (1) implement retrieval-augmented generation — instead of static " "prompt templates, retrieve relevant FAQ articles or resolution histories; " "(2) add a re-ranking step to select the best candidate response from multiple " "LLM generations; (3) implement active learning — flag uncertain predictions, " "have humans label them, and retrain periodically." ), simple_text=( "With a proper gaming computer: train the robot on all the data, not just a sample. " "With real company data: teach the robot using actual past customer conversations. " "With more time: instead of using a fixed template, let the robot look up real " "answers from the company's help pages. Like teaching someone to use a real " "reference book instead of memorising everything." ), tip_text="RAG as a next step is a strong answer because it shows architectural thinking beyond fine-tuning." ) story += qa_block( question="What is the cost of running this system at scale?", answer_text=( "The main cost is the Anthropic API for response generation. At the time of building " "this, Claude Sonnet costs approximately $3 per million input tokens and $15 per million " "output tokens. A typical support response exchange is ~500 input + ~200 output tokens, " "so roughly $0.0045 per resolved query. At 10,000 queries/day that is ~$45/day. " "The classifier inference cost is negligible once hosted — DistilBERT runs in ~21ms " "per query on CPU. Cost optimisation levers: use Claude Haiku for simple intents " "and Sonnet only for complex ones, implement response caching for common queries, " "or fine-tune a smaller model as a responder." ), simple_text=( "The expensive part is asking Claude to write each reply — it costs a tiny amount " "per reply, but it adds up with millions of customers. The sorting robot is almost " "free to run. To save money: use the cheaper AI for easy questions, save common " "replies so you only pay once, and use the expensive AI only for tricky problems." ), tip_text="Showing cost-awareness is impressive — it signals you think like a product engineer, not just a researcher." ) story.append(PageBreak()) # =========================================================================== # SECTION 8 — BEHAVIOURAL QUESTIONS # =========================================================================== story += sec("Section 8: Behavioural Questions") story += cat(["STAR-Format Answers"]) story.append(body( "Behavioural questions use the STAR format: Situation, Task, Action, Result. " "Each answer below is structured this way. Practice saying these out loud." )) story.append(sp(8)) story += qa_block( question="Tell me about a time you had to make a pragmatic decision under constraints.", answer_text=( "SITUATION: I was implementing the evaluation pipeline and had chosen RAGAS as the " "framework. After installation it threw OpenAI API errors despite being configured " "with Anthropic. " "TASK: I needed working evaluation metrics before I could report any results. " "ACTION: I investigated the root cause (RAGAS hardcoded OpenAI for embeddings, " "and its async architecture caused timeouts at the API rate limit). I concluded " "that patching a third-party library would take longer than building a clean " "alternative. I wrote a 100-line synchronous evaluator using Claude Haiku directly. " "RESULT: Clean, reproducible evaluation in 50 minutes wall-clock time, equivalent " "conceptual metrics, and no external dependencies. The decision to cut scope (drop " "the RAGAS library, keep the metric concepts) was the right engineering call." ), simple_text=( "I tried to use a ready-made tool but it was broken for my use case. " "I had two choices: spend days fixing the broken tool, or spend one hour building " "a simpler version myself. I chose to build my own. It worked perfectly and " "I learned more by building it." ), tip_text="This story shows: debugging skills, engineering judgment, bias for action, and pragmatism. It is one of the best stories in this project." ) story += qa_block( question="Tell me about a time you had to explain something technical to a non-technical person.", answer_text=( "SITUATION: The confidence threshold concept — why the system escalates to humans — " "is technical but has a direct business impact. " "TASK: Explain it so a product manager or stakeholder could understand the design decision. " "ACTION: I framed it as 'the robot tells you when it's not sure'. I used the analogy " "of a new employee who, when unsure, asks their manager rather than guessing. " "The 70% threshold means: if the model's certainty is below 70%, a real human " "handles the ticket. " "RESULT: The stakeholder immediately understood both what the system does and why " "the fallback matters for customer experience, without needing to understand " "softmax probabilities." ), simple_text=( "I explained that the robot says 'I'm not sure, a person should handle this' when " "it's less than 70% confident. Like a new cashier who, when they're unsure about a " "return policy, calls their manager rather than guessing and getting it wrong." ), tip_text="Prepare a non-technical explanation of every key concept. Being able to bridge technical and business language is a senior skill." ) story += qa_block( question="What would you do differently if you started this project again?", answer_text=( "Three things: First, I would pin all dependency versions immediately in " "requirements.txt to avoid breaking changes (like the sklearn multi_class issue). " "Second, I would design the evaluation framework before building the pipeline — " "knowing I'd need faithfulness and relevancy metrics upfront would have made " "me design better output schemas in the pipeline from the start. " "Third, I would collect a small real-world test set (actual customer messages from " "a live product) rather than splitting the training dataset — this gives a more " "honest estimate of production performance." ), simple_text=( "I would: write down exactly which version of every tool I'm using before I start, " "plan how I'll test the results BEFORE building the robot (not after), " "and use real customer messages for the final test instead of ones from the " "same practice dataset." ), tip_text="Showing genuine reflection, not fake humility ('I would've worked harder') is what recruiters want. These three specific things are credible." ) story.append(PageBreak()) # =========================================================================== # SECTION 9 — RAPID FIRE # =========================================================================== story += sec("Section 9: Rapid-Fire Questions") story += cat(["Short, Confident Answers"]) story.append(body( "These questions expect a 1-3 sentence answer. Practice answering each in under 20 seconds." )) story.append(sp(6)) rapid_fire = [ ("What is a transformer model?", "A neural network architecture that uses 'attention' to weigh how important each word " "is relative to every other word in a sentence, enabling much better language understanding " "than earlier sequential models like LSTMs.", "A robot brain that reads a whole sentence at once and figures out which words " "are most important based on all the other words around them."), ("What is tokenisation?", "The process of splitting raw text into subword units (tokens) that the model can process. " "DistilBERT uses WordPiece tokenisation, which breaks rare words into common subword pieces " "to handle a fixed vocabulary.", "Chopping up a sentence into small pieces the robot can understand. 'unbelievable' " "might become ['un', '##believ', '##able'] — three pieces."), ("What is softmax?", "A function that converts a vector of raw scores (logits) into a probability distribution " "summing to 1.0. Used as the final layer in classification to produce interpretable confidence scores.", "A calculator that takes a list of numbers and converts them into percentages that " "add up to 100%. So 'billing: 4.2, login: 0.3' becomes 'billing: 80%, login: 20%'."), ("What is overfitting?", "When a model memorises the training data so well that it performs poorly on unseen data. " "It learns noise and specific examples rather than general patterns.", "The robot studied so hard for its practice test that it memorised all the exact " "questions. On the real test with different questions, it fails because it memorised " "instead of understanding."), ("What is the difference between a language model and a classifier?", "A language model generates text (predicts the next token). A classifier assigns a " "label to an input from a fixed set of categories. DistilBERT here is used as a classifier " "(with a classification head), not as a generator. Claude is the language model.", "The classifier is like a sorting machine that puts things in boxes. " "The language model is like a writer that creates new text. " "This project uses both: one to sort, one to write."), ("What is knowledge distillation?", "A technique where a smaller 'student' model is trained to mimic the outputs of a larger " "'teacher' model. DistilBERT was distilled from BERT: the student learns to match BERT's " "output distributions, not just the training labels.", "Like a wise teacher summarising all their knowledge into a compact book for a student. " "The student (DistilBERT) is smaller but very smart because it learned from the big teacher (BERT)."), ("What is an epoch?", "One full pass through the entire training dataset. Training for 3 epochs means the model " "sees every training example 3 times. More epochs can improve performance but risk overfitting.", "The robot reading every single practice example once. Three epochs = the robot " "read the whole practice book three times."), ("What is gradient descent?", "An optimisation algorithm that iteratively adjusts model weights in the direction that " "reduces the loss function. The learning rate controls the size of each step.", "Imagine rolling a ball down a hill to find the lowest point. Gradient descent " "is the maths that tells the robot which direction 'downhill' is, so it can improve " "its answers little by little."), ("What is the Anthropic API?", "A REST API provided by Anthropic that allows developers to send messages to Claude models " "and receive generated text responses. It requires an API key and is billed per token.", "It's a way to talk to Claude (the AI) from your own program. You send a message, " "Claude sends back a reply. Like texting, but for code."), ("What is a confusion matrix?", "A table showing predicted vs actual labels for a classifier. Rows are actual classes, " "columns are predicted classes. Diagonal cells are correct predictions; off-diagonal " "cells are misclassifications.", "A report card showing where the robot gets confused. If it often mixes up " "'billing_issue' and 'cancellation_request', those cells will be bright in the table."), ] for question, answer_full, answer_simple in rapid_fire: story += [ sp(4), q(question), sp(2), a(answer_full), sp(2), simple(answer_simple), sp(4), rule() ] story.append(PageBreak()) # =========================================================================== # SECTION 10 — QUESTIONS TO ASK # =========================================================================== story += sec("Section 10: Questions YOU Should Ask the Interviewer") story += cat(["Show Curiosity & Depth"]) story.append(body( "Asking smart questions at the end of an interview shows genuine interest, " "seniority, and that you have thought beyond the code. Have at least 3-4 ready." )) story.append(sp(10)) questions_to_ask = [ ( "How do you currently handle intent classification in your customer support pipeline, " "and what are the biggest pain points?", "This shows you're thinking about real-world application and positioning your skills " "against actual problems they face. It also opens a dialogue about how your project " "experience is relevant." ), ( "What does your model evaluation and monitoring setup look like in production? " "How do you detect when a model starts degrading?", "This shows you think about the full ML lifecycle — not just training, but " "post-deployment health. It's a question a senior ML engineer would ask." ), ( "How do you balance automation confidence with the cost of human escalation? " "Where do you draw the line between automated response and human review?", "This ties directly to your project's confidence threshold design. " "It shows you understand the business trade-off, not just the technical one." ), ( "What is the main bottleneck in your current NLP/LLM pipeline — is it latency, " "accuracy, cost, or something else?", "This is a strategic question that shows you understand constraints. " "The answer will tell you a lot about the team's priorities." ), ( "How do you manage prompt versioning when you update templates that are live in production?", "This is a sharp, specific question about LLMOps. Most companies struggle with this " "and it shows you have thought about deployment realities beyond just building the model." ), ( "How does the team approach handling new intent categories that weren't in the original training set?", "This shows you understand model limitations (out-of-distribution inputs) and are " "thinking about long-term maintenance." ), ] for i, (q_text, why_text) in enumerate(questions_to_ask, 1): block = [ sp(4), Paragraph(f"Question {i}:", CATEGORY_STYLE), Paragraph(f'"{q_text}"', ParagraphStyle( "QtoAsk", parent=styles["Normal"], fontSize=11, leading=16, textColor=colors.HexColor("#0f3460"), fontName="Helvetica-BoldOblique", leftIndent=10, spaceAfter=4, borderColor=colors.HexColor("#0f3460"), borderWidth=1, borderPad=8, backColor=colors.HexColor("#f0f4ff"), borderRadius=4 )), sp(4), Paragraph( f"Why this works: {why_text}", ParagraphStyle( "WhyWorks", parent=styles["Normal"], fontSize=10, leading=14, textColor=colors.HexColor("#374151"), leftIndent=10, spaceAfter=6, backColor=colors.HexColor("#f9fafb"), borderColor=colors.HexColor("#d1d5db"), borderWidth=0.5, borderPad=6 ) ), sp(4), rule() ] story += block story.append(PageBreak()) # =========================================================================== # QUICK REFERENCE CHEAT SHEET # =========================================================================== story += sec("Quick Reference — Key Numbers to Remember") story.append(body( "Memorise these numbers. Quoting exact results confidently makes a strong impression." )) story.append(sp(10)) cheat_sheet_data = [ ["Metric", "Value", "What It Means"], ["Baseline Weighted F1", "0.9958", "TF-IDF + Logistic Regression accuracy"], ["DistilBERT Weighted F1", "0.9825", "Fine-tuned transformer accuracy"], ["Min per-class F1 (Baseline)", "0.985", "Worst single class performance"], ["Min per-class F1 (DistilBERT)", "0.953", "Worst single class performance"], ["Answer Relevancy", "0.837 (PASS)", "LLM responses address customer questions"], ["Faithfulness", "0.667 (expected low)", "LLM generates beyond the template — intentional"], ["Confidence threshold", "0.70", "Below this, route to human review"], ["Training data size", "26,872 examples", "Full Bitext dataset"], ["CPU training subsample", "3,000 examples", "Adaptive for CPU-only training"], ["Training steps (CPU)", "300 steps", "~20 min on CPU"], ["Evaluation queries", "50 queries", "RAGAS-style evaluation sample"], ["Baseline model size", "0.4 MB", "TF-IDF + LR pickle"], ["DistilBERT model size", "4,088 MB", "Fine-tuned transformer weights"], ["Baseline inference", "0.15 ms/sample", "Extremely fast"], ["DistilBERT inference", "21.18 ms/sample", "140x slower but much more capable"], ["Intent categories", "6", "billing, account, technical, inquiry, cancellation, feedback"], ["Test set queries (generation)", "200 queries", "Subsampled for LLM generation pipeline"], ] cheat = Table(cheat_sheet_data, colWidths=[6*cm, 4.5*cm, 6*cm]) cheat.setStyle(TableStyle([ ("BACKGROUND", (0, 0), (-1, 0), colors.HexColor("#0f3460")), ("TEXTCOLOR", (0, 0), (-1, 0), colors.white), ("FONTNAME", (0, 0), (-1, 0), "Helvetica-Bold"), ("FONTSIZE", (0, 0), (-1, -1), 9), ("ROWBACKGROUNDS", (0, 1), (-1, -1), [colors.HexColor("#f0f4ff"), colors.white]), ("ALIGN", (1, 0), (1, -1), "CENTER"), ("VALIGN", (0, 0), (-1, -1), "MIDDLE"), ("GRID", (0, 0), (-1, -1), 0.4, colors.HexColor("#dee2e6")), ("TOPPADDING", (0, 0), (-1, -1), 6), ("BOTTOMPADDING", (0, 0), (-1, -1), 6), ("LEFTPADDING", (0, 0), (-1, -1), 6), # Highlight the pass/fail rows ("TEXTCOLOR", (1, 6), (1, 6), colors.HexColor("#065f46")), ("TEXTCOLOR", (1, 7), (1, 7), colors.HexColor("#92400e")), ("FONTNAME", (1, 6), (1, 7), "Helvetica-Bold"), ])) story.append(cheat) story.append(sp(16)) # Final encouragement story += [ HRFlowable(width="100%", thickness=2, color=colors.HexColor("#0f3460"), spaceAfter=12), Paragraph("You Built This. Own It.", ParagraphStyle( "Final", parent=styles["Normal"], fontSize=16, textColor=colors.HexColor("#0f3460"), fontName="Helvetica-Bold", alignment=TA_CENTER, spaceAfter=8 )), Paragraph( "Every number in that cheat sheet came from code you wrote. " "Every decision — from the confidence threshold to the custom evaluator — " "was yours. When an interviewer asks about this project, you are the expert " "in the room. Speak with confidence.", ParagraphStyle( "FinalBody", parent=styles["Normal"], fontSize=11, leading=17, textColor=colors.HexColor("#374151"), alignment=TA_CENTER, spaceAfter=6 ) ), ] # --------------------------------------------------------------------------- # Build PDF # --------------------------------------------------------------------------- doc = SimpleDocTemplate( str(OUTPUT), pagesize=A4, leftMargin=2*cm, rightMargin=2*cm, topMargin=2.5*cm, bottomMargin=2.5*cm, title="Interview Prep — Customer Support AI", author="Claude Code", ) doc.build(story) print(f"PDF saved -> {OUTPUT.resolve()}")