diff --git "a/Python/app.js" "b/Python/app.js" new file mode 100644--- /dev/null +++ "b/Python/app.js" @@ -0,0 +1,2239 @@ +const modules = [ + { + id: "python-fundamentals", + title: "Python Fundamentals for DS", + icon: "🐍", + category: "Foundations", + description: "Data structures, comprehensions, file I/O, virtual environments" + }, + { + id: "numpy", + title: "NumPy & Scientific Computing", + icon: "πŸ”’", + category: "Scientific", + description: "ndarrays, broadcasting, vectorization, linear algebra" + }, + { + id: "pandas", + title: "Pandas & Data Manipulation", + icon: "🐼", + category: "Data Wrangling", + description: "DataFrames, groupby, pivot, time series, merging" + }, + { + id: "visualization", + title: "Data Visualization", + icon: "πŸ“Š", + category: "Visualization", + description: "Matplotlib, Seaborn, Plotly β€” from basics to publication-ready" + }, + { + id: "advanced-python", + title: "Advanced Python", + icon: "🎯", + category: "Advanced", + description: "OOP, decorators, async, multiprocessing, type hints" + }, + { + id: "sklearn", + title: "Python for ML (Scikit-learn)", + icon: "πŸ€–", + category: "Machine Learning", + description: "Pipelines, transformers, cross-validation, hyperparameter tuning" + }, + { + id: "pytorch", + title: "Deep Learning with PyTorch", + icon: "πŸ”₯", + category: "Deep Learning", + description: "Tensors, autograd, nn.Module, training loops, transfer learning" + }, + { + id: "tensorflow", + title: "TensorFlow & Keras", + icon: "🧠", + category: "Deep Learning", + description: "Sequential/Functional API, callbacks, TensorBoard, deployment" + }, + { + id: "production", + title: "Production Python", + icon: "πŸ“¦", + category: "Engineering", + description: "Testing, packaging, logging, FastAPI for model serving" + }, + { + id: "optimization", + title: "Performance & Optimization", + icon: "⚑", + category: "Optimization", + description: "Profiling, Numba, Cython, memory optimization, Dask" + } +]; + + +const MODULE_CONTENT 
= { + "python-fundamentals": { + concepts: ` +
+

🐍 Python Fundamentals β€” Complete Deep Dive

+ +
+
⚑ Python Is Not What You Think
+
Python is a dynamically-typed, garbage-collected, interpreted language with a C-based runtime (CPython). Everything is an object β€” integers, functions, even classes. Understanding this object model is what separates beginners from professionals.
+
+ +

1. Data Structures β€” Complete Reference

+ + + + + + + + + + +
TypeMutableOrderedHashableUse Case
listβœ“βœ“βœ—Sequential data, time series, feature lists
tupleβœ—βœ“βœ“Fixed records, dict keys, DataFrame rows
dictβœ“βœ“ (3.7+)βœ—Lookup tables, JSON, config, caches
setβœ“βœ—βœ—Unique values, membership testing O(1)
frozensetβœ—βœ—βœ“Immutable set, usable as dict keys
dequeβœ“βœ“βœ—O(1) append/pop both ends, sliding windows
bytesβœ—βœ“βœ“Binary data, serialization, network I/O
bytearrayβœ“βœ“βœ—Mutable binary buffers
+ +

2. Time Complexity β€” What Every Dev Must Know

+ + + + + + + + +
Operationlistdictset
Lookup by index/keyO(1)O(1)β€”
Search (x in ...)O(n)O(1)O(1)
Insert/AppendO(1) end, O(n) middleO(1)O(1)
DeleteO(n)O(1)O(1)
SortO(n log n)β€”β€”
IterationO(n)O(n)O(n)
+

Real-world impact: Checking if an item exists in a list of 1M elements = ~50ms. In a set = ~0.00005ms. That's 1,000,000x faster. Always use sets/dicts for membership testing.

+ +

3. Python Memory Model

+
+
⚑ Everything Is An Object on the Heap
+
Variables are references (pointers), not boxes. a = [1,2,3] creates a list on the heap; a points to it. b = a makes both point to the same list. This is aliasing β€” the #1 source of bugs in beginner Python code.
+
+

Reference Counting: Each object tracks how many names reference it. When count = 0, freed immediately. del decrements the count, doesn't necessarily free memory.

+

Integer Interning: Python caches integers -5 to 256. So a = 100; b = 100; a is b β†’ True. But a = 1000; b = 1000; a is b β†’ may be False. Never use is for value comparison.

+

Garbage Collection: 3 generations (gen0, gen1, gen2). New objects in gen0. Survivors promoted. Use gc.collect() after deleting large ML models.

+ +

4. Generators & Iterators β€” The Heart of Python

+
+
πŸ”„ Lazy Evaluation
+
yield suspends state, return terminates. A list of 1B items = ~8GB. A generator = ~100 bytes. The Iterator Protocol: any object with __iter__ + __next__. Generator expressions: (x**2 for x in range(10**9)) β€” O(1) memory.
+
+

yield from: Delegates to sub-generator. Forwards send() and throw(). Essential for building composable data pipelines.

+

send(): Two-way communication with generators (coroutines). value = yield result β€” both receives and produces values.

+ +

5. Closures & First-Class Functions

+

Functions are first-class objects β€” passed as args, returned, assigned. A closure captures variables from enclosing scope. Foundation of decorators, callbacks, and functional programming.

+ +

6. Critical Python Gotchas for Projects

+
+
⚠️ The 5 Deadliest Python Traps
+ 1. Mutable Default Args: def f(x, lst=[]): β€” list shared across ALL calls. Fix: lst=None.
+ 2. Late Binding Closures: [lambda: i for i in range(5)] β€” all return 4! Fix: lambda i=i: i.
+ 3. Shallow Copy: list(a) copies outer list but shares inner objects.
+ 4. String Concatenation: s += "text" in a loop creates new string every time β€” O(nΒ²). Use ''.join(parts).
+ 5. Circular Imports: Module A imports B, B imports A β†’ ImportError. Fix: restructure or lazy import. +
+ +

7. Error Handling for Production Projects

+
+
πŸ›‘οΈ Exception Hierarchy You Must Know
+
+ BaseException β†’ Exception (catch this) β†’ ValueError, TypeError, KeyError, FileNotFoundError, ConnectionError...
+ Rules: (1) Never catch bare except:. (2) Catch specific exceptions. (3) Use else for success path. (4) finally always runs. (5) Create custom exceptions for your project. +
+
+ +

8. collections Module β€” Power Tools

+ + + + + + + + +
ClassPurposeProject Use Case
defaultdictDict with default factoryGroup data: defaultdict(list)
CounterCount hashable objectsLabel distribution, word frequency
namedtupleLightweight immutable classReturn multiple named values
dequeDouble-ended queueSliding window, BFS, ring buffer
ChainMapStack multiple dictsConfig layers: defaults β†’ env β†’ CLI
OrderedDictOrdered dict (legacy)move_to_end() for LRU cache
+ +

9. itertools β€” Memory-Efficient Pipelines

+ + + + + + + + + + +
FunctionWhat It DoesProject Use
chain()Concatenate iterables lazilyMerge data files
islice()Slice any iteratorTake first N from generator
groupby()Group consecutive elementsProcess sorted logs by date
product()Cartesian productHyperparameter grid
combinations()All r-length combosFeature interaction pairs
starmap()map() with unpacked argsApply function to paired data
accumulate()Running accumulatorCumulative sums, running max
tee()Clone iterator N timesMultiple passes over stream
+ +

10. File I/O for Real Projects

+ + + + + + + + +
FormatReadWriteBest For
JSONjson.load(f)json.dump(obj, f)Configs, API responses
CSVcsv.DictReader(f)csv.DictWriter(f)Tabular data (small)
YAMLyaml.safe_load(f)yaml.dump(obj, f)Config files
Picklepickle.load(f)pickle.dump(obj, f)Python objects, models
Parquetpd.read_parquet()df.to_parquet()Large DataFrames (fast)
SQLitesqlite3.connect()SQL queriesLocal database
+ +

11. pathlib β€” Modern File Handling

+

Stop using os.path.join(). Use pathlib.Path: Path('data') / 'train' / 'images'. Methods: .glob(), .read_text(), .mkdir(parents=True), .exists(), .suffix, .stem. Cross-platform, readable, powerful.

+ +

12. Virtual Environments & Dependency Management

+ + + + + + + +
ToolBest ForKey Feature
venvSimple projectsBuilt-in, lightweight
condaDS/ML (C deps)Handles CUDA, MKL, OpenCV
poetryModern packagingLock files, deterministic builds
uvSpeed10-100x faster than pip (Rust-based)
pip-toolsRequirements pinningpip-compile for lock files
+ +

13. Project Structure Template

+
my_project/ +β”œβ”€β”€ src/ +β”‚ └── my_package/ +β”‚ β”œβ”€β”€ __init__.py +β”‚ β”œβ”€β”€ data/ # Data loading & processing +β”‚ β”œβ”€β”€ models/ # Model definitions +β”‚ β”œβ”€β”€ training/ # Training loops +β”‚ β”œβ”€β”€ evaluation/ # Metrics & evaluation +β”‚ β”œβ”€β”€ serving/ # API endpoints +β”‚ └── utils/ # Shared utilities +β”œβ”€β”€ tests/ +β”‚ β”œβ”€β”€ conftest.py # Shared fixtures +β”‚ β”œβ”€β”€ test_data.py +β”‚ └── test_models.py +β”œβ”€β”€ configs/ # YAML/JSON configs +β”œβ”€β”€ notebooks/ # EDA notebooks +β”œβ”€β”€ scripts/ # CLI scripts +β”œβ”€β”€ pyproject.toml # Modern Python packaging +β”œβ”€β”€ Dockerfile +β”œβ”€β”€ Makefile # Common commands +└── README.md
+ +

14. String Operations for Data Cleaning

+

f-strings (3.6+): f"{accuracy:.2%}" β†’ "95.23%". f"{x=}" (3.8+) β†’ "x=42" for debugging. f"{name!r}" β†’ shows repr. regex: re.compile(pattern) for repeated use. re.sub() for cleaning. re.findall() for extraction. Always compile patterns used in loops.

+ +

15. Command-Line Interface (CLI) Tools

+

argparse: Built-in CLI parsing. click: Decorator-based, more Pythonic. typer: Modern, uses type hints. Every production project needs a CLI for: training, evaluation, data processing, deployment scripts.

+
`, + code: ` +
+

πŸ’» Python Fundamentals β€” Project Code

+ +

1. Generator Pipeline β€” Process Any Size Data

+
import json +from pathlib import Path + +def read_jsonl(filepath): + """Read JSON Lines file lazily β€” handles any size.""" + with open(filepath) as f: + for line in f: + yield json.loads(line.strip()) + +def filter_records(records, min_score=0.5): + for rec in records: + if rec.get('score', 0) >= min_score: + yield rec + +def batch(iterable, size=64): + """Batch any iterable into fixed-size chunks.""" + from itertools import islice + it = iter(iterable) + while chunk := list(islice(it, size)): + yield chunk + +# Compose into pipeline β€” still O(1) memory! +pipeline = batch(filter_records(read_jsonl("data.jsonl")), size=32) +for chunk in pipeline: + process(chunk) # Only 32 records in memory at a time
+ +

2. Coroutine Pattern β€” Running Statistics

+
def running_stats(): + """Coroutine that computes running mean & variance.""" + n = 0 + mean = 0.0 + M2 = 0.0 + while True: + x = yield {'mean': mean, 'var': M2/n if n > 0 else 0, 'n': n} + n += 1 + delta = x - mean + mean += delta / n + M2 += delta * (x - mean) # Welford's algorithm β€” numerically stable + +stats = running_stats() +next(stats) # Prime +stats.send(10) # {'mean': 10.0, 'var': 0, 'n': 1} +stats.send(20) # {'mean': 15.0, 'var': 25.0, 'n': 2}
+ +

3. Custom Exception Hierarchy for Projects

+
# Define project-specific exceptions +class ProjectError(Exception): + """Base exception for the project.""" + +class DataValidationError(ProjectError): + def __init__(self, column, expected, actual): + self.column = column + super().__init__( + f"Column '{column}': expected {expected}, got {actual}" + ) + +class ModelNotTrainedError(ProjectError): + pass + +# Usage with proper error handling +def load_and_validate(path): + try: + df = pd.read_csv(path) + except FileNotFoundError: + raise DataValidationError("file", "exists", "missing") + except pd.errors.EmptyDataError: + raise DataValidationError("data", "non-empty", "empty file") + else: + print(f"Loaded {len(df)} rows") + return df + finally: + print("Load attempt complete")
+ +

4. Closures & Mutable Default Trap

+
# ⚠️ THE #1 PYTHON BUG β€” Mutable default argument +def bad_append(item, lst=[]): # List shared across ALL calls! + lst.append(item) + return lst +bad_append(1) # [1] +bad_append(2) # [1, 2] ← SURPRISE! + +# βœ… CORRECT β€” use None sentinel +def good_append(item, lst=None): + if lst is None: + lst = [] + lst.append(item) + return lst
+ +

5. collections in Action

+
from collections import defaultdict, Counter, deque + +# defaultdict β€” group data without KeyError +samples_by_label = defaultdict(list) +for feat, label in zip(features, labels): + samples_by_label[label].append(feat) + +# Counter β€” class distribution + top-N +dist = Counter(y_train) +print(dist.most_common(3)) +imbalance_ratio = dist.most_common()[0][1] / dist.most_common()[-1][1] + +# deque β€” sliding window for streaming +window = deque(maxlen=5) +for val in data_stream: + window.append(val) + moving_avg = sum(window) / len(window)
+ +

6. CLI Tool with argparse

+
import argparse + +def main(): + parser = argparse.ArgumentParser(description="Train ML model") + parser.add_argument("--data", required=True, help="Path to data") + parser.add_argument("--model", choices=["rf", "xgb", "lgbm"], default="rf") + parser.add_argument("--epochs", type=int, default=10) + parser.add_argument("--lr", type=float, default=0.001) + parser.add_argument("--dry-run", action="store_true") + args = parser.parse_args() + + print(f"Training {args.model} on {args.data}") + # python train.py --data data.csv --model xgb --epochs 50 + +if __name__ == "__main__": + main()
+ +

7. Advanced Comprehensions & Modern Python

+
# Walrus operator (:=) β€” assign + use (3.8+) +if (n := len(data)) > 1000: + print(f"Large dataset: {n} samples") + +# Dict merge (3.9+) +config = defaults | overrides + +# match-case β€” Structural Pattern Matching (3.10+) +match command: + case {"action": "train", "model": model_name}: + train(model_name) + case {"action": "predict", "data": path}: + predict(path) + case _: + print("Unknown command") + +# Extended unpacking +first, *middle, last = sorted(scores) + +# Nested dict comprehension +metrics = { + model: {metric: score for metric, score in results.items()} + for model, results in all_results.items() +}
+ +

8. Regex for Data Cleaning

+
import re + +# Compile patterns used repeatedly (10x faster) +EMAIL = re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}') +PHONE = re.compile(r'\b\d{3}[-.]?\d{3}[-.]?\d{4}\b') + +# Extract all emails from text +emails = EMAIL.findall(text) + +# Clean text for NLP +def clean_text(text): + text = re.sub(r'http\S+', '', text) # Remove URLs + text = re.sub(r'[^a-zA-Z\s]', '', text) # Keep only letters + text = re.sub(r'\s+', ' ', text).strip() # Normalize whitespace + return text.lower()
+ +

9. Configuration Management

+
import json, yaml +from pathlib import Path +from dataclasses import dataclass, asdict + +@dataclass +class Config: + model_name: str = "random_forest" + learning_rate: float = 0.001 + batch_size: int = 32 + epochs: int = 100 + data_path: str = "data/train.csv" + + @classmethod + def from_yaml(cls, path): + with open(path) as f: + return cls(**yaml.safe_load(f)) + + def save(self, path): + Path(path).write_text(json.dumps(asdict(self), indent=2)) + +config = Config.from_yaml("configs/experiment.yaml")
+
`, + interview: ` +
+

🎯 Python Fundamentals β€” Interview Questions

+
Q1: List vs tuple β€” when to use which?

Answer: Tuples: immutable, hashable (dict keys), less memory. Lists: mutable, growable. Use tuples for fixed data (coordinates, config). Use lists for collections that change. Tuples signal "this shouldn't be modified."

+
Q2: How does Python's GIL affect DS?

Answer: GIL prevents multi-threading for CPU-bound Python. But NumPy/Pandas release the GIL during C operations. For pure Python CPU work β†’ multiprocessing. For I/O β†’ threading works. For data science, the GIL rarely matters.

+
Q3: Shallow vs deep copy?

Answer: copy.copy(): outer container copied, inner objects shared. copy.deepcopy(): everything copied recursively. Real trap: df2 = df is NOT a copy β€” it's aliasing. Use df.copy().

+
Q4: What is the mutable default argument trap?

Answer: def f(x, lst=[]): β€” default list created ONCE and shared. Fix: lst=None; if lst is None: lst = []. #1 Python interview gotcha.

+
Q5: Why are generators critical for large data?

Answer: O(1) memory. 1B items as list = 8GB. As generator = 100 bytes. Use for: file processing, streaming, batch training. yield from for composition.

+
Q6: Explain LEGB scope rule.

Answer: Name lookup order: Local β†’ Enclosing β†’ Global β†’ Built-in. nonlocal for enclosing scope, global for module. list = [1] shadows built-in list().

+
Q7: How to handle a 10GB CSV?

Answer: (1) pd.read_csv(chunksize=N), (2) usecols=['needed'], (3) dtype={'col':'int32'}, (4) Dask, (5) DuckDB for SQL on CSV, (6) Polars for Rust-speed.

+
Q8: Dict lookup O(1) vs list search O(n)?

Answer: Dicts use hash tables. Key β†’ hash β†’ slot index. O(1) average. Lists scan linearly. x in set is O(1) but x in list is O(n). For 1M items: microseconds vs milliseconds.

+
Q9: Explain Python's garbage collection.

Answer: (1) Reference counting — freed at count=0. (2) Cyclic GC — detects A→B→A cycles. 3 generations. gc.collect() after deleting large models.

+
Q10: What is __slots__?

Answer: Replaces per-instance __dict__ with fixed array. ~40% memory savings. Use for millions of small objects. Trade-off: no dynamic attributes.

+
Q11: How do you structure a Python project?

Answer: src/package/ layout. pyproject.toml for config. tests/ with pytest. configs/ for YAML. Makefile for common commands. Separate data, models, training, serving.

+
Q12: What's the difference between is and ==?

Answer: == checks value equality. is checks identity (same memory). Use is only for singletons: x is None, x is True. Integer interning makes 256 is 256 True but 1000 is 1000 may be False.

+
` + }, + +"numpy": { + concepts: ` +
+

πŸ”’ NumPy β€” Complete Deep Dive

+ +
+
⚑ Why NumPy Is 50-100x Faster
+
(1) Contiguous memory β€” CPU cache-friendly. (2) Compiled C loops. (3) SIMD instructions β€” 4-8 floats simultaneously. Python list: array of pointers to objects. NumPy: raw typed data in a block.
+
+ +

1. ndarray Internals

+ + + + + + +
FeaturePython ListNumPy ndarray
StoragePointers to objectsContiguous typed data
Memory per int~28 bytes + pointer8 bytes (int64)
OperationsPython loopCompiled C/Fortran
SIMDImpossibleCPU vector instructions
+ +

2. Memory Layout & Strides

+
+
🧠 Strides = The Secret Behind Views
+
Every ndarray has strides β€” bytes to jump in each dimension. For (3,4) float64: strides = (32, 8). Slicing creates views (no copy) by adjusting strides. arr[::2] doubles row stride. C-order (row-major): rows contiguous. Fortran-order: columns contiguous. Iterate along last axis for best performance.
+
+ +

3. Broadcasting Rules

+
+
🎯 Rules (Right to Left)
+
Two arrays compatible when, for each trailing dim: dims are equal OR one is 1. (5,3,1) + (1,4) β†’ (5,3,4). The "1" dims stretch virtually β€” no memory copied. Common: X - X.mean(axis=0) β†’ (1000,5) - (5,) works!
+
+ +

4. Universal Functions (ufuncs)

+

Vectorized element-wise functions. Advanced methods: .reduce() (fold), .accumulate() (running total), .outer() (outer product), .at() (unbuffered in-place). Create custom with np.frompyfunc().

+ +

5. dtype Selection for Projects

+ + + + + + + + +
dtypeBytesWhen to Use
float324Deep learning, GPU (50% less memory)
float648Default. Statistics, scientific computing
float162Mixed-precision inference
int324Indices, counts
int81Quantized models
bool1Masks for filtering
+ +

6. np.einsum β€” One Function for All Tensor Ops

+

Einstein summation: express ANY tensor operation. Matrix multiply: 'ik,kj->ij'. Batch matmul: 'bij,bjk->bik'. Trace: 'ii->'. Often faster than chaining NumPy calls β€” avoids intermediate arrays.

+ +

7. Linear Algebra for ML Projects

+ + +

8. Random Number Generation

+

Modern: rng = np.random.default_rng(42) (NumPy 1.17+). PCG64 algorithm, thread-safe. Old np.random.seed(42) is global, not thread-safe. Always use default_rng() in projects.

+ +

9. Image Processing with NumPy

+

Images are just 3D arrays: (height, width, channels). Crop: img[100:200, 50:150]. Resize: scipy. Normalize: img / 255.0. Augment: flip img[:, ::-1], rotate with scipy.ndimage. Foundation of all computer vision.

+
`, + code: ` +
+

πŸ’» NumPy Project Code

+ +

1. Feature Engineering with Broadcasting

+
import numpy as np + +# Z-score normalization +X = np.random.randn(1000, 5) +X_norm = (X - X.mean(axis=0)) / X.std(axis=0) # (1000,5) - (5,) + +# Min-Max scaling +X_scaled = (X - X.min(0)) / (X.max(0) - X.min(0) + 1e-8) + +# Pairwise Euclidean distance matrix +diff = X[:, np.newaxis, :] - X[np.newaxis, :, :] # (N,1,D)-(1,N,D) +dist_matrix = np.sqrt((diff ** 2).sum(axis=-1)) # (N,N)
+ +

2. Boolean Masking & Advanced Indexing

+
# Remove outliers (3-sigma rule) +data = np.random.randn(10000) +clean = data[np.abs(data - data.mean()) < 3 * data.std()] + +# np.where β€” conditional replacement +preds = np.array([0.3, 0.7, 0.1, 0.9]) +labels = np.where(preds > 0.5, 1, 0) + +# np.select β€” multiple conditions +conditions = [data < -1, data > 1] +choices = ['low', 'high'] +category = np.select(conditions, choices, default='mid') + +# Fancy indexing β€” sample without replacement +rng = np.random.default_rng(42) +idx = rng.choice(len(X), size=500, replace=False) +X_sample = X[idx]
+ +

3. einsum for Complex Operations

+
# Matrix multiply +C = np.einsum('ik,kj->ij', A, B) + +# Batch matrix multiply (deep learning) +batch_result = np.einsum('bij,bjk->bik', batch_A, batch_B) + +# Cosine similarity matrix +norms = np.linalg.norm(X, axis=1, keepdims=True) +X_normed = X / norms +sim = np.einsum('ij,kj->ik', X_normed, X_normed)
+ +

4. Implement Linear Regression from Scratch

+
# Normal equation: w = (X^T X)^(-1) X^T y +# Better: use lstsq for numerical stability +X_b = np.c_[np.ones((len(X), 1)), X] # Add bias column +w, residuals, rank, sv = np.linalg.lstsq(X_b, y, rcond=None) +y_pred = X_b @ w +mse = ((y - y_pred) ** 2).mean() +r2 = 1 - ((y - y_pred)**2).sum() / ((y - y.mean())**2).sum()
+ +

5. Memory-Mapped Files for Huge Data

+
# Process arrays larger than RAM +big = np.memmap('huge.npy', dtype=np.float32, + mode='w+', shape=(1000000, 100)) +subset = big[5000:6000] # Only reads 1000 rows from disk + +# Structured arrays β€” mixed types without Pandas +dt = np.dtype([('name', 'U10'), ('age', 'i4'), ('score', 'f8')]) +data = np.array([('Alice', 30, 95.5)], dtype=dt)
+ +

6. Implement PCA from Scratch

+
def pca(X, n_components): + # Center the data + X_centered = X - X.mean(axis=0) + # Covariance matrix + cov = X_centered.T @ X_centered / (len(X) - 1) + # Eigendecomposition + eigenvalues, eigenvectors = np.linalg.eigh(cov) + # Sort by largest eigenvalue + idx = eigenvalues.argsort()[::-1][:n_components] + components = eigenvectors[:, idx] + # Project data + X_pca = X_centered @ components + explained_var = eigenvalues[idx] / eigenvalues.sum() + return X_pca, explained_var, components
+
`, + interview: ` +
+

🎯 NumPy Interview Questions

+
Q1: Why is NumPy faster than Python lists?

Answer: (1) Contiguous memory (cache-friendly). (2) Compiled C loops. (3) SIMD instructions. Together: 50-100x speedup.

+
Q2: View vs copy?

Answer: Slicing = view (shares data). Fancy indexing = copy. Check: np.shares_memory(a, b). Views are dangerous: modifying view modifies original.

+
Q3: Broadcasting rules?

Answer: Right-to-left: dims must equal or one is 1. (3,1) + (1,4) β†’ (3,4). No memory copied. Gotcha: (3,) + (3,4) fails β€” reshape to (3,1).

+
Q4: axis=0 vs axis=1?

Answer: axis=0: operate down rows (collapse rows). axis=1: across columns (collapse columns). (100,5): mean(axis=0)β†’(5,). mean(axis=1)β†’(100,).

+
Q5: Implement PCA with NumPy?

Answer: Center, compute covariance, eigendecompose (eigh), sort by eigenvalue, project onto top-k eigenvectors. Or SVD directly.

+
Q6: np.dot vs @ vs einsum?

Answer: @: clean, broadcasts. np.dot: confusing for 3D+. einsum: most flexible, any tensor op. Use @ for readability.

+
Q7: How to handle NaN?

Answer: np.isnan() detects. np.nanmean() ignores NaN. Gotcha: NaN == NaN is False (IEEE 754).

+
Q8: C-order vs Fortran-order?

Answer: C: rows contiguous (default). Fortran: columns contiguous (LAPACK/BLAS). Iterate last axis for speed. Convert: np.asfortranarray().

+
` +}, + +"pandas": { + concepts: ` +
+

🐼 Pandas β€” Complete Deep Dive

+ +
+
⚑ DataFrame Internals β€” BlockManager
+
A DataFrame is NOT a 2D array. Uses BlockManager β€” same-dtype columns stored in contiguous blocks. Column operations: fast (same block). Row iteration: slow (crosses blocks). This is why df.iterrows() is 100x slower than vectorized ops.
+
+ +

1. The Golden Rules

+
+
⚠️ 5 Rules That Prevent 90% of Pandas Bugs
+ 1. Use .loc (label) and .iloc (position) β€” never chain indexing.
+ 2. df.loc[0:5] includes 5. df.iloc[0:5] excludes 5.
+ 3. df[mask]['col'] = x creates copy. Use df.loc[mask, 'col'] = x.
+ 4. df2 = df is NOT a copy. Use df2 = df.copy().
+ 5. Always check df.dtypes and df.isna().sum() first. +
+ +

2. GroupBy β€” Split-Apply-Combine

+

Most powerful Pandas operation. (1) Split β†’ (2) Apply function β†’ (3) Combine results. GroupBy is lazy β€” no computation until aggregation. Key methods:

+ + + + + + +
MethodOutput ShapeUse Case
agg()Reduced (one row/group)Sum, mean, count per group
transform()Same as inputFill with group mean, normalize within group
filter()Subset of groupsKeep groups with N > 100
apply()FlexibleCustom function per group
+ +

3. Pandas 2.0 β€” Major Changes

+ + + + + + +
FeatureBefore (1.x)After (2.0+)
BackendNumPy onlyApache Arrow option
Copy semanticsConfusingCopy-on-Write
String dtypeobjectstring[pyarrow] (faster)
Nullable typesNaN for everythingpd.NA (proper null)
+ +

4. Polars vs Pandas

+ + + + + + + +
FeaturePandasPolars
Speed1x5-50x (Rust)
ParallelismSingle-threadedMulti-threaded auto
APIEagerLazy + Eager
EcosystemMassiveGrowing fast
Use whenEDA, small-med data, legacyLarge data, production
+ +

5. Merge/Join Patterns

+ + + + + +
MethodHowWhen
merge()SQL-style joins on columnsCombine tables on shared keys
join()Joins on indexIndex-based combining
concat()Stack along axisAppend rows/columns
+

Common pitfall: Merge produces more rows than expected = many-to-many join. Always check: len(merged) vs len(left).

+ +

6. Memory Optimization Strategies

+ + + + + + + +
StrategySavingsWhen
Category dtype90%+Few unique strings
Downcast numerics50-75%int64 β†’ int32/int16
Sparse arrays80%+Mostly zeros/NaN
PyArrow backend30-50%String-heavy data
Read only needed columnsVariableusecols=['a','b']
+ +

7. Window Functions for Time Series

+

.rolling(N): fixed sliding window. .expanding(): cumulative. .ewm(span=N): exponentially weighted. All support .mean(), .std(), .apply(). Essential for: lag features, moving averages, volatility, Bollinger bands.

+ +

8. Pivot Tables & Crosstab

+

df.pivot_table(values, index, columns, aggfunc) β€” summarize data by two categorical dimensions. pd.crosstab() β€” frequency table of two categorical columns. Essential for EDA and business reporting.

+ +

9. Method Chaining Pattern

+

Fluent API: .assign() instead of df['col']=. .pipe(func) for custom. .query('col > 5') for readable filters. No intermediate variables = cleaner, reproducible pipelines.

+
`, + code: ` +
+

πŸ’» Pandas Project Code

+ +

1. Complete Data Loading & Cleaning Pipeline

+
import pandas as pd +import numpy as np + +def load_and_clean(path, config): + """Production data loading pipeline.""" + df = ( + pd.read_csv(path, usecols=config['columns'], + dtype=config.get('dtypes', None), + parse_dates=config.get('date_cols', [])) + .rename(columns=str.lower) + .drop_duplicates() + .assign( + date=lambda df: pd.to_datetime(df['date']), + revenue=lambda df: df['price'] * df['qty'] + ) + .query('revenue > 0') + .pipe(optimize_dtypes) + ) + return df
+ +

2. GroupBy β€” Beyond Basics

+
# Named aggregation +summary = df.groupby('category').agg( + total=('revenue', 'sum'), + avg_price=('price', 'mean'), + n_orders=('order_id', 'nunique'), + top_product=('product', lambda x: x.value_counts().index[0]) +) + +# Transform β€” normalize within groups +df['pct_of_group'] = df.groupby('cat')['rev'].transform( + lambda x: x / x.sum() * 100 +) + +# Filter β€” keep only groups with enough data +df_filtered = df.groupby('user').filter(lambda x: len(x) >= 5)
+ +

3. Time Series Feature Engineering

+
def create_time_features(df, date_col, target_col): + """Generate time series features for ML.""" + df = df.sort_values(date_col).copy() + + # Lag features + for lag in [1, 3, 7, 14, 30]: + df[f'lag_{lag}'] = df[target_col].shift(lag) + + # Rolling statistics + for window in [7, 14, 30]: + df[f'rolling_mean_{window}'] = df[target_col].rolling(window).mean() + df[f'rolling_std_{window}'] = df[target_col].rolling(window).std() + + # Date features + df['dayofweek'] = df[date_col].dt.dayofweek + df['month'] = df[date_col].dt.month + df['is_weekend'] = df['dayofweek'].isin([5, 6]).astype(int) + + # Percentage change + df['pct_change'] = df[target_col].pct_change() + + return df
+ +

4. Memory Optimization

+
def optimize_dtypes(df): + """Reduce DataFrame memory by 60-80%.""" + start_mem = df.memory_usage(deep=True).sum() / 1024**2 + + for col in df.select_dtypes(['int']).columns: + df[col] = pd.to_numeric(df[col], downcast='integer') + for col in df.select_dtypes(['float']).columns: + df[col] = pd.to_numeric(df[col], downcast='float') + for col in df.select_dtypes(['object']).columns: + if df[col].nunique() / len(df) < 0.5: + df[col] = df[col].astype('category') + + end_mem = df.memory_usage(deep=True).sum() / 1024**2 + print(f"Memory: {start_mem:.1f}MB β†’ {end_mem:.1f}MB ({100*(1-end_mem/start_mem):.0f}% reduction)") + return df
+ +

5. Merge with Validation

+
# LEFT JOIN with indicator for debugging +merged = pd.merge(orders, customers, on='customer_id', + how='left', indicator=True, validate='many_to_one') + +# Check for orphan records +orphans = merged[merged['_merge'] == 'left_only'] +print(f"Orphan orders: {len(orphans)}") + +# Multi-key merge +result = pd.merge(df1, df2, on=['date', 'product_id'], + how='inner', suffixes=('_actual', '_predicted'))
+ +

6. Pivot Table for Business Reporting

+
# Revenue by month and category +pivot = df.pivot_table( + values='revenue', + index=df['date'].dt.to_period('M'), + columns='category', + aggfunc=['sum', 'count'], + margins=True # Add totals row/column +) + +# Crosstab β€” frequency of two categorical columns +ct = pd.crosstab(df['region'], df['product'], normalize='index')
+
`, + interview: ` +
+

🎯 Pandas Interview Questions

+
Q1: SettingWithCopyWarning?

Answer: Chained indexing modifies copy. Fix: df.loc[mask, 'col'] = val. Pandas 2.0+ Copy-on-Write eliminates this.

+
Q2: merge vs join vs concat?

Answer: merge: SQL joins on columns. join: on index. concat: stack along axis. Use merge for column joins, concat for appending.

+
Q3: apply vs map vs transform?

Answer: map: Series element-wise. apply: rows/columns. transform: same-shape output. All slow β€” prefer vectorized when possible.

+
Q4: GroupBy transform vs agg?

Answer: agg reduces. transform broadcasts back. Use transform for "fill with group mean" or "normalize within group" patterns.

+
Q5: How to handle missing data?

Answer: (1) dropna(thresh=N), (2) ffill() for time series (fillna(method='ffill') is deprecated in pandas 2.x), (3) fillna(df.median()) for ML, (4) interpolate(method='time'). Always check df.isna().sum() first.

+
Q6: Pandas vs Polars?

Answer: Polars: 5-50x faster (Rust), multi-threaded, lazy eval. Pandas: mature ecosystem, wide compatibility. New projects with big data β†’ Polars.

+
Q7: What is MultiIndex?

Answer: Hierarchical indexing. Use for pivot tables, panel data. Access with .xs() or tuple. Reset with .reset_index().

+
Q8: How to optimize a 5GB DataFrame?

Answer: (1) Read only needed columns. (2) Downcast dtypes. (3) Category for strings. (4) Sparse for zeros. (5) PyArrow backend. (6) Process in chunks. Can reduce 5GB to 1GB.

+
` +}, + +"visualization": { + concepts: ` +
+

πŸ“Š Data Visualization β€” Complete Guide

+ +
+
⚑ The Grammar of Graphics
+
Data + Aesthetics (x, y, color, size) + Geometry (bars, lines, points) + Statistics (binning, smoothing) + Coordinates (cartesian, polar) + Facets (subplots). Every chart = this framework.
+
+ +

1. Choosing the Right Chart

+ + + + + + + + + + + +
QuestionChart TypeLibrary
Distribution?Histogram, KDE, Box, ViolinSeaborn
Relationship?Scatter, Hexbin, RegressionSeaborn/Plotly
Comparison?Bar, Grouped bar, ViolinSeaborn
Trend over time?Line, Area chartPlotly/Matplotlib
Correlation?HeatmapSeaborn
Part of whole?Pie, Treemap, SunburstPlotly
Geographic?Choropleth, MapboxPlotly/Folium
High-dimensional?Parallel coords, UMAPPlotly
ML results?Confusion matrix, ROC, SHAPSeaborn/SHAP
+ +

2. Matplotlib Architecture

+

Three layers: Backend (rendering), Artist (everything drawn), Scripting (pyplot). Figure β†’ Axes (subplots) β†’ Axis objects. Always use OO API: fig, ax = plt.subplots().

+

rcParams: Global defaults. plt.rcParams['font.size'] = 14. Create style files for project consistency. plt.style.use('seaborn-v0_8-whitegrid').

+ +

3. Color Theory for Data

+
+
πŸ’‘ Color Guide
+ Sequential: viridis, plasma (low→high).
+ Diverging: RdBu, coolwarm (center matters).
+ Categorical: Set2, tab10 (distinct groups).
+ Never use rainbow/jet β€” bad for colorblind, perceptually non-uniform. +
+ +

4. Seaborn β€” Statistical Visualization

+

Three API levels: Figure-level (relplot, catplot, displot), Axes-level (scatterplot, boxplot), Objects API (0.12+). Auto-computes regression lines, confidence intervals, density estimates.

+ +

5. Plotly β€” Interactive Dashboards

+

JavaScript-powered: hover, zoom, selection. plotly.express for quick plots. plotly.graph_objects for control. Integrates with Dash for production dashboards. Supports 3D, maps, animations. Export to HTML.

+ +

6. Visualization for ML Projects

+ + + + + + + + + + + +
What to VisualizeChartWhy
Class distributionBar chartDetect imbalance
Feature distributionsHistogram/KDE gridFind skew, outliers
Feature correlationsHeatmap (triangular)Multicollinearity
Training curvesLine plot (loss/acc vs epoch)Detect overfit/underfit
Model comparisonBox plot of CV scoresCompare variance
Confusion matrixAnnotated heatmapError analysis
ROC curveLine plot + AUCThreshold selection
Feature importanceHorizontal barModel interpretation
SHAP valuesBeeswarm/waterfallIndividual predictions
+ +

7. Common Mistakes

+ +
`, + code: ` +
+

πŸ’» Visualization Project Code

+ +

1. Publication-Quality Multi-Subplot Figure

+
import matplotlib.pyplot as plt +import numpy as np + +# Professional style setup +plt.rcParams.update({ + 'font.size': 12, 'axes.titlesize': 14, + 'figure.facecolor': 'white', + 'axes.spines.top': False, 'axes.spines.right': False +}) + +fig, axes = plt.subplots(2, 2, figsize=(14, 10)) + +# Distribution +axes[0,0].hist(data, bins=30, alpha=0.7, color='steelblue', edgecolor='white') +axes[0,0].axvline(data.mean(), color='red', linestyle='--', label='Mean') +axes[0,0].set_title('Distribution') + +# Scatter with colormap +sc = axes[0,1].scatter(x, y, c=z, cmap='viridis', alpha=0.7) +plt.colorbar(sc, ax=axes[0,1]) + +# Line with confidence interval +axes[1,0].plot(x, y_mean, 'b-', linewidth=2) +axes[1,0].fill_between(x, y_mean-y_std, y_mean+y_std, alpha=0.2) + +# Bar with error bars +axes[1,1].bar(categories, values, yerr=errors, capsize=5, color='coral') + +plt.tight_layout() +plt.savefig('figure.png', dpi=300, bbox_inches='tight')
+ +

2. ML Evaluation Dashboard

+
import seaborn as sns +from sklearn.metrics import confusion_matrix, roc_curve, auc + +def plot_model_evaluation(y_true, y_pred, y_proba, model, feature_names): + fig, axes = plt.subplots(1, 3, figsize=(18, 5)) + + # Confusion Matrix + cm = confusion_matrix(y_true, y_pred) + sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', ax=axes[0]) + axes[0].set_title('Confusion Matrix') + + # ROC Curve + fpr, tpr, _ = roc_curve(y_true, y_proba) + axes[1].plot(fpr, tpr, label=f'AUC={auc(fpr,tpr):.3f}') + axes[1].plot([0,1], [0,1], 'k--') + axes[1].set_title('ROC Curve') + axes[1].legend() + + # Feature Importance + importance = model.feature_importances_ + idx = importance.argsort() + axes[2].barh(feature_names[idx], importance[idx]) + axes[2].set_title('Feature Importance') + + plt.tight_layout()
+ +

3. Seaborn β€” EDA in One Call

+
# Pair plot β€” all relationships at once +sns.pairplot(df, hue='target', diag_kind='kde', + plot_kws={'alpha': 0.6}) + +# Correlation heatmap (upper triangle) +mask = np.triu(np.ones_like(df.corr(), dtype=bool)) +sns.heatmap(df.corr(), mask=mask, annot=True, + fmt='.2f', cmap='RdBu_r', center=0)
+ +

4. Plotly β€” Interactive Dashboard

+
import plotly.express as px +from plotly.subplots import make_subplots +import plotly.graph_objects as go + +# Animated scatter (Gapminder style) +fig = px.scatter(df, x='gdp', y='life_exp', + animation_frame='year', size='pop', + color='continent', hover_name='country') + +# Training curves dashboard +fig = make_subplots(rows=1, cols=2, + subplot_titles=['Loss', 'Accuracy']) +fig.add_trace(go.Scatter(y=train_loss, name='Train Loss'), row=1, col=1) +fig.add_trace(go.Scatter(y=val_loss, name='Val Loss'), row=1, col=1) +fig.add_trace(go.Scatter(y=train_acc, name='Train Acc'), row=1, col=2) +fig.add_trace(go.Scatter(y=val_acc, name='Val Acc'), row=1, col=2) +fig.write_html('training_dashboard.html')
+
`, + interview: ` +
+

🎯 Visualization Interview Questions

+
Q1: Matplotlib vs Seaborn vs Plotly?

Answer: Matplotlib: full control, papers. Seaborn: statistical EDA, beautiful. Plotly: interactive, stakeholders. Rule: Seaborn→EDA, Matplotlib→papers, Plotly→stakeholders.

+
Q2: How to visualize high-dimensional data?

Answer: (1) PCA/t-SNE/UMAP to 2D, (2) Pair plots, (3) Parallel coordinates, (4) Correlation heatmap, (5) SHAP plots.

+
Q3: Handle overplotting?

Answer: alpha, hexbin, 2D KDE, random sampling, Datashader for millions of points.

+
Q4: Good viz for non-technical audience?

Answer: Title states conclusion. One insight per chart. Annotate key points. Consistent color. Minimal chart junk. Tell a story.

+
Q5: Figure vs Axes?

Answer: Figure = canvas. Axes = plot area. fig, axes = plt.subplots(2,2). Use OO API: ax.plot() not plt.plot().

+
Q6: Accessible visualizations?

Answer: Colorblind palettes (viridis), shapes not just color, sufficient contrast, alt text, 12pt+ fonts.

+
Q7: How to visualize model performance?

Answer: Training curves (loss/acc vs epoch), confusion matrix (heatmap), ROC/AUC, feature importance (horizontal bars), SHAP for interpretability.

+
` +}, + +"advanced-python": { + concepts: ` +
+

🎯 Advanced Python β€” Complete Engineering Guide

+ +

1. Decorators β€” Complete Patterns

+
+
⚑ Three Levels of Decorators
+
Level 1: Simple wrapper (timing, logging). Level 2: With arguments (factory). Level 3: Class-based with state. Always use functools.wraps.
+
+

Common patterns: Retry with exponential backoff, caching, rate limiting, authentication, input validation, deprecation warnings.

+ +

2. Context Managers

+

Guarantee resource cleanup. Two approaches: (1) Class-based (__enter__/__exit__), (2) @contextlib.contextmanager with yield. Use for: files, DB connections, GPU locks, temporary settings, timers.

+ +

3. Dataclasses vs namedtuple vs Pydantic vs attrs

+ + + + + + + +
FeaturenamedtupledataclassPydanticattrs
Mutableβœ—βœ“βœ“ (v2)βœ“
Validationβœ—βœ—βœ“ (auto)βœ“ (validators)
JSONβœ—βœ—βœ“ (built-in)via cattrs
PerformanceFastestFastMediumFast
Use forRecordsData containersAPI modelsComplex classes
+ +

4. Type Hints β€” Complete Guide

+
+
🎯 Why Type Hints Matter for Projects
+
Enable: IDE autocompletion, mypy static analysis, self-documenting code, Pydantic validation. Python doesn't enforce at runtime β€” they're for tools and humans.
+
+ + + + + + + + + +
HintMeaningExample
list[int]List of ints (3.9+)scores: list[int] = []
dict[str, Any]Dict str keysconfig: dict[str, Any]
int | NoneOptional (3.10+)x: int | None = None
Callable[[int], str]Function typeCallbacks
TypeVarGenericGeneric containers
LiteralExact valuesLiteral['train','test']
TypedDictDict with typed keysJSON schemas
+ +

5. async/await β€” Concurrent I/O

+

For I/O-bound tasks: API calls, DB queries, file reads. NOT for CPU (use multiprocessing). Event loop manages coroutines cooperatively. asyncio.gather() runs concurrently. Game changer: 100 API calls in ~1s vs 100s sequentially.

+ +

6. Design Patterns for ML Projects

+ + + + + + + + + +
PatternUse CasePython Implementation
StrategySwap algorithmsPass function/class as argument
FactoryCreate objects by nameRegistry dict: models['rf']
ObserverTraining callbacksEvent system with hooks
PipelineData transformationsChain of fit→transform
SingletonModel cache, DB poolModule-level or metaclass
TemplateTraining loopABC with abstract methods
RegistryAuto-register modelsClass decorator + dict
+ +

7. Descriptors β€” How @property Works

+

Any object implementing __get__/__set__/__delete__. @property is a descriptor. Control attribute access at class level. Used in Django ORM, SQLAlchemy, dataclass fields.

+ +

8. Metaclasses β€” Advanced

+

Classes are objects. Metaclasses define how classes behave. type is the default. Use for: auto-registration, interface enforcement, singleton. Most should use class decorators instead.

+ +

9. __slots__ for Memory Efficiency

+

Replaces __dict__ with fixed array. ~40% memory savings per instance. Use for millions of small objects. Trade-off: no dynamic attributes.

+ +

10. Multiprocessing for CPU-Bound Work

+

multiprocessing.Pool or concurrent.futures.ProcessPoolExecutor. Each process has its own GIL. Share data via: multiprocessing.Queue, shared_memory, or serialize (pickle). Overhead: process creation ~100ms. Only use for expensive computations.

+
`, + code: ` +
+

πŸ’» Advanced Python Project Code

+ +

1. Production Decorator β€” Retry with Backoff

+
from functools import wraps +import time, logging + +def retry(max_attempts=3, delay=1.0, exceptions=(Exception,)): + def decorator(func): + @wraps(func) + def wrapper(*args, **kwargs): + for attempt in range(max_attempts): + try: + return func(*args, **kwargs) + except exceptions as e: + if attempt == max_attempts - 1: + raise + wait = delay * (2 ** attempt) + logging.warning(f"Retry {attempt+1}/{max_attempts}: {e}, waiting {wait}s") + time.sleep(wait) + return wrapper + return decorator + +@retry(max_attempts=3, delay=0.5) +def fetch_data(url): + return requests.get(url, timeout=10).json()
+ +

2. Dataclass for ML Experiments

+
from dataclasses import dataclass, field, asdict +import json +from datetime import datetime + +@dataclass +class Experiment: + name: str + model: str + lr: float = 0.001 + epochs: int = 100 + batch_size: int = 32 + tags: list[str] = field(default_factory=list) + timestamp: str = field(default_factory=lambda: datetime.now().isoformat()) + metrics: dict = field(default_factory=dict) + + def __post_init__(self): + if self.lr <= 0: raise ValueError("lr must be positive") + + def save(self, path): + with open(path, 'w') as f: + json.dump(asdict(self), f, indent=2) + + @classmethod + def load(cls, path): + with open(path) as f: + return cls(**json.load(f))
+ +

3. Model Registry Pattern

+
MODEL_REGISTRY = {} + +def register_model(name): + def decorator(cls): + MODEL_REGISTRY[name] = cls + return cls + return decorator + +@register_model("random_forest") +class RandomForestModel: + def train(self, X, y): ... + +@register_model("xgboost") +class XGBoostModel: + def train(self, X, y): ... + +# Create model by name from config +model = MODEL_REGISTRY[config["model_name"]]()
+ +

4. async β€” Parallel API Calls

+
import asyncio +import aiohttp + +async def fetch(session, url): + async with session.get(url) as resp: + return await resp.json() + +async def fetch_all(urls): + async with aiohttp.ClientSession() as session: + tasks = [fetch(session, url) for url in urls] + return await asyncio.gather(*tasks, return_exceptions=True) + +# 100 API calls in ~1 second vs 100 seconds +results = asyncio.run(fetch_all(urls))
+ +

5. Pydantic for API Data Validation

+
from pydantic import BaseModel, Field, field_validator + +class PredictionRequest(BaseModel): + features: list[float] = Field(..., min_length=1) + model_name: str = "default" + threshold: float = Field(0.5, ge=0, le=1) + + @field_validator('features') + @classmethod + def check_features(cls, v): + if any(np.isnan(x) for x in v): + raise ValueError("NaN not allowed") + return v + +# Auto-validates on creation +req = PredictionRequest(features=[1.0, 2.0, 3.0])
+ +

6. Context Manager β€” Timer & GPU Lock

+
from contextlib import contextmanager +import time + +@contextmanager +def timer(name="Block"): + start = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - start + print(f"{name}: {elapsed:.4f}s") + +with timer("Training"): + model.fit(X_train, y_train)
+
`, + interview: ` +
+

🎯 Advanced Python Interview Questions

+
Q1: Explain MRO.

Answer: C3 Linearization for multiple inheritance. ClassName.mro() shows order. Subclasses before bases, left-to-right.

+
Q2: dataclass vs Pydantic?

Answer: dataclass: no validation, fast, standard library. Pydantic: auto-validation, JSON serialization, API models. Use Pydantic for external data, dataclass for internal.

+
Q3: When async vs threading vs multiprocessing?

Answer: async: I/O-bound, 1000s connections. threading: I/O, simpler. multiprocessing: CPU-bound (bypasses GIL). NumPy releases GIL internally.

+
Q4: How does @property work?

Answer: It's a descriptor with __get__/__set__. Attribute access triggers descriptor protocol. Used for computed attributes and validation.

+
Q5: Decorator with parameters?

Answer: Three nested functions: factory(params) β†’ decorator(func) β†’ wrapper(*args). Use @wraps(func) always.

+
Q6: What is __slots__?

Answer: Fixed array instead of __dict__. ~40% less memory. No dynamic attributes. Use for millions of objects.

+
Q7: Explain closures with use case.

Answer: Function capturing enclosing scope variables. Use: factory functions, decorators, callbacks. make_multiplier(3) returns function multiplying by 3.

+
Q8: Design patterns in Python vs Java?

Answer: Python makes many patterns trivial: Strategy = pass a function. Singleton = module variable. Factory = dict of classes. Observer = list of callables. Python prefers simplicity.

+
` +}, + +"sklearn": { + concepts: ` +
+

πŸ€– Scikit-learn β€” Complete ML Engineering

+ +
+
⚑ The Estimator API
+
Estimators: fit(X, y). Transformers: transform(X). Predictors: predict(X). Consistency allows seamless swapping and composition via Pipelines.
+
+ +

1. Pipelines β€” The Foundation of Production ML

+
+
⚠️ Data Leakage β€” The #1 ML Mistake
+ Fitting scaler on ENTIRE dataset before split = test set info leaks into training. Fix: put ALL preprocessing inside Pipeline. Pipeline ensures fit only on training folds during CV. +
+ +

2. ColumnTransformer β€” Real-World Data

+

Real data has mixed types. ColumnTransformer applies different transformations per column set: StandardScaler for numerics, OneHotEncoder for categoricals, TfidfVectorizer for text. All in one pipeline.

+ +

3. Custom Transformers

+

Inherit BaseEstimator + TransformerMixin. Implement fit(X, y) and transform(X). TransformerMixin gives fit_transform() free. Use check_is_fitted() for safety.

+ +

4. Cross-Validation Strategies

+ + + + + + + +
StrategyWhenKey Point
KFoldGeneralDoesn't preserve class ratios
StratifiedKFoldImbalanced classificationPreserves class distribution
TimeSeriesSplitTime-ordered dataTrain always before test
GroupKFoldGrouped data (patients)Same group never in train+test
RepeatedStratifiedKFoldRobust estimationMultiple random splits
+ +

5. Hyperparameter Tuning

+ + + + + + +
MethodProsCons
GridSearchCVExhaustiveExponential with params
RandomizedSearchCVFaster, continuous distsMay miss optimal
OptunaSmart search, pruningExtra dependency
HalvingSearchCVSuccessive halvingNewer, less docs
+ +

6. Complete ML Workflow

+
+
🎯 The Steps
+
+ 1. EDA β†’ 2. Train/Val/Test split β†’ 3. Build Pipeline (preprocess + model) β†’ 4. Cross-validate multiple models β†’ 5. Select best β†’ 6. Tune hyperparameters β†’ 7. Final evaluation on test set β†’ 8. Save model β†’ 9. Deploy +
+
+ +

7. Feature Engineering

+ + + + + + + +
TransformerPurpose
PolynomialFeaturesInteraction & polynomial terms
FunctionTransformerApply any function (log, sqrt)
SplineTransformerNon-linear feature basis
KBinsDiscretizerBin continuous into categories
TargetEncoderEncode categoricals by target mean
+ +

8. Model Selection Guide

+ + + + + + + +
Data SizeModelWhy
<1K rowsLogistic/SVM/KNNSimple, less overfitting
1K-100KRandom Forest, XGBoostBest accuracy/speed tradeoff
100K+XGBoost, LightGBMHandles large data efficiently
Very largeSGDClassifier/onlineIncremental learning
TabularGradient BoostingAlmost always best for tabular
+ +

9. Handling Imbalanced Data

+ + + + + + + +
StrategyHow
class_weight='balanced'Built-in for most models
SMOTESynthetic oversampling (imblearn)
Threshold tuningAdjust decision threshold from 0.5
MetricsUse F1, Precision-Recall AUC (not accuracy)
EnsembleBalancedRandomForest
+ +

10. Model Persistence

+

joblib.dump(model, 'model.pkl') β€” faster than pickle for NumPy arrays. model = joblib.load('model.pkl'). Always save the entire pipeline (not just model) to include preprocessing. Version your models with timestamps.

+
`, + code: ` +
+

πŸ’» Scikit-learn Project Code

+ +

1. Production Pipeline β€” Complete Template

+
from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer, make_column_selector +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.impute import SimpleImputer +from sklearn.ensemble import RandomForestClassifier +from sklearn.model_selection import cross_val_score + +preprocessor = ColumnTransformer([ + ('num', Pipeline([ + ('imputer', SimpleImputer(strategy='median')), + ('scaler', StandardScaler()) + ]), make_column_selector(dtype_include='number')), + + ('cat', Pipeline([ + ('imputer', SimpleImputer(strategy='constant', fill_value='missing')), + ('encoder', OneHotEncoder(handle_unknown='ignore', sparse_output=False)) + ]), make_column_selector(dtype_include='object')) +]) + +pipe = Pipeline([ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(n_estimators=100, n_jobs=-1)) +]) + +# No data leakage! +scores = cross_val_score(pipe, X, y, cv=5, scoring='f1') +print(f"F1: {scores.mean():.3f} Β± {scores.std():.3f}")
+ +

2. Custom Transformer

+
from sklearn.base import BaseEstimator, TransformerMixin +from sklearn.utils.validation import check_is_fitted + +class OutlierClipper(BaseEstimator, TransformerMixin): + def __init__(self, factor=1.5): + self.factor = factor + + def fit(self, X, y=None): + Q1 = np.percentile(X, 25, axis=0) + Q3 = np.percentile(X, 75, axis=0) + IQR = Q3 - Q1 + self.lower_ = Q1 - self.factor * IQR + self.upper_ = Q3 + self.factor * IQR + return self + + def transform(self, X): + check_is_fitted(self) + return np.clip(X, self.lower_, self.upper_)
+ +

3. Model Comparison Framework

+
from sklearn.model_selection import cross_validate + +models = { + 'Logistic': LogisticRegression(), + 'RF': RandomForestClassifier(n_estimators=100), + 'XGBoost': XGBClassifier(n_estimators=100), + 'LightGBM': LGBMClassifier(n_estimators=100) +} + +results = {} +for name, model in models.items(): + pipe = Pipeline([('prep', preprocessor), ('model', model)]) + cv = cross_validate(pipe, X, y, cv=5, + scoring=['accuracy', 'f1', 'roc_auc'], n_jobs=-1) + results[name] = {k: v.mean() for k, v in cv.items()} + print(f"{name}: F1={cv['test_f1'].mean():.3f}") + +pd.DataFrame(results).T.sort_values('test_f1', ascending=False)
+ +

4. Hyperparameter Tuning with Optuna

+
import optuna + +def objective(trial): + params = { + 'n_estimators': trial.suggest_int('n_estimators', 50, 500), + 'max_depth': trial.suggest_int('max_depth', 3, 15), + 'learning_rate': trial.suggest_float('lr', 1e-3, 0.3, log=True), + 'subsample': trial.suggest_float('subsample', 0.6, 1.0) + } + model = XGBClassifier(**params) + score = cross_val_score(model, X, y, cv=5, scoring='f1').mean() + return score + +study = optuna.create_study(direction='maximize') +study.optimize(objective, n_trials=100) +print(f"Best F1: {study.best_value:.3f}") +print(f"Best params: {study.best_params}")
+ +

5. Save & Load Pipeline

+
import joblib +from datetime import datetime + +# Save entire pipeline (includes preprocessing!) +version = datetime.now().strftime('%Y%m%d_%H%M') +joblib.dump(pipe, f'models/pipeline_{version}.pkl') + +# Load and predict +pipe = joblib.load('models/pipeline_20240315_1430.pkl') +predictions = pipe.predict(new_data) # Preprocessing included!
+
`, + interview: ` +
+

🎯 Scikit-learn Interview Questions

+
Q1: What is data leakage?

Answer: Test set info influencing training. Common: fitting scaler before split. Fix: Pipeline ensures fit only on train folds.

+
Q2: Pipeline vs ColumnTransformer?

Answer: Pipeline: sequential (A→B→C). ColumnTransformer: parallel branches (different processing per column type). Usually CT inside Pipeline.

+
Q3: Which cross-validation when?

Answer: KFold: general. Stratified: imbalanced. TimeSeriesSplit: temporal. GroupKFold: grouped data.

+
Q4: Grid vs Random vs Bayesian?

Answer: Grid: exhaustive, exponential. Random: better for many params. Bayesian (Optuna): learns, most efficient for expensive models.

+
Q5: Custom transformer?

Answer: BaseEstimator + TransformerMixin. Implement fit(X,y) and transform(X). TransformerMixin gives fit_transform free.

+
Q6: How to handle imbalanced data?

Answer: (1) class_weight='balanced'. (2) SMOTE oversampling. (3) Adjust threshold. (4) Use F1/AUC not accuracy. (5) BalancedRandomForest.

+
Q7: When to use which model?

Answer: Tabular: gradient boosting (XGBoost/LightGBM). Small data: Logistic/SVM. Interpretability: Logistic/trees. Speed: LightGBM. Baseline: Random Forest.

+
Q8: fit() vs transform() vs predict()?

Answer: fit: learn params from data. transform: apply params. predict: generate predictions. fit on train only, transform/predict on both.

+
` +}, + +"pytorch": { + concepts: ` +
+

πŸ”₯ Deep Learning with PyTorch β€” Complete Guide

+ +
+
⚑ PyTorch Philosophy: Define-by-Run
+
PyTorch builds the computational graph dynamically as operations execute (eager mode). Debug with print(), breakpoints, standard Python control flow.
+
+ +

1. Tensors β€” The Foundation

+ + + + + + + + +
ConceptWhatKey Point
TensorN-dimensional arrayLike NumPy but GPU-capable
requires_gradTrack for autogradOnly for learnable params
deviceCPU or CUDA.to('cuda') moves to GPU
.detach()Stop gradient trackingUse for inference/metrics
.item()Extract scalarUse for logging loss
.contiguous()Ensure contiguous memoryRequired after transpose/permute
+ +

2. Autograd β€” How Backpropagation Works

+
+
🧠 Computational Graph (DAG)
+
When requires_grad=True, every operation is recorded. Each tensor stores grad_fn. .backward() traverses graph in reverse (chain rule). Graph destroyed after backward() unless retain_graph=True. Gradients ACCUMULATE β€” must optimizer.zero_grad() before each backward.
+
+ +

3. nn.Module β€” Building Blocks

+

Every model inherits nn.Module. Layers in __init__, computation in forward(). model.train()/model.eval() toggle BatchNorm/Dropout. model.parameters() for optimizer. model.state_dict() for save/load. Use nn.Sequential for simple stacks, nn.ModuleList/nn.ModuleDict for dynamic architectures.

+ +

4. Training Loop β€” The Standard Pattern

+

(1) Forward pass β†’ (2) Compute loss β†’ (3) optimizer.zero_grad() β†’ (4) loss.backward() β†’ (5) optimizer.step(). Add: gradient clipping, LR scheduling, mixed precision, logging, checkpointing.

+ +

5. Custom Datasets & DataLoaders

+

Dataset: override __len__ and __getitem__. DataLoader: batching, shuffling, multi-worker. num_workers>0 for parallel loading. pin_memory=True for faster GPU transfer. Use collate_fn for variable-length sequences.

+ +

6. Learning Rate Scheduling

+ + + + + + + +
SchedulerStrategyWhen
StepLRDecay every N epochsSimple baseline
CosineAnnealingLRCosine decayStandard for vision
OneCycleLRWarmup + decayBest for fast training
ReduceLROnPlateauDecay on stallWhen loss plateaus
LinearLRLinear warmupTransformer models
+ +

7. Mixed Precision Training (AMP)

+

torch.cuda.amp: forward in float16 (2x faster), gradients in float32. GradScaler prevents underflow. 2-3x speedup. Standard practice for any GPU training.

+ +

8. Transfer Learning Patterns

+

Load pretrained β†’ Freeze base β†’ Replace head β†’ Fine-tune with smaller LR. Discriminative LR: lower LR for earlier layers. Progressive unfreezing: unfreeze layers one at a time. Both work better than fine-tuning everything at once.

+ +

9. Distributed Training (DDP)

+

DistributedDataParallel: each GPU runs model copy, gradients averaged via all-reduce. Near-linear scaling. Use torchrun to launch. DistributedSampler for data splitting.

+ +

10. Debugging & Profiling

+ + + + + + + +
ToolPurpose
register_forward_hookView intermediate activations
register_backward_hookMonitor gradient magnitudes
torch.profilerGPU/CPU profiling
torch.cuda.memory_summary()GPU memory debugging
detect_anomaly()Find NaN/Inf sources
+ +

11. torch.compile (2.x)

+

JIT compiles model for 30-60% speedup. model = torch.compile(model). Uses TorchDynamo + Triton. Works on existing code. The future of PyTorch performance.

+
`, + code: ` +
+

πŸ’» PyTorch Project Code

+ +

1. Complete Training Framework

+
import torch +import torch.nn as nn +from torch.utils.data import DataLoader + +class Trainer: + def __init__(self, model, optimizer, criterion, device='cuda'): + self.model = model.to(device) + self.optimizer = optimizer + self.criterion = criterion + self.device = device + self.history = {'train_loss': [], 'val_loss': []} + + def train_epoch(self, loader): + self.model.train() + total_loss = 0 + for X, y in loader: + X, y = X.to(self.device), y.to(self.device) + self.optimizer.zero_grad() + loss = self.criterion(self.model(X), y) + loss.backward() + torch.nn.utils.clip_grad_norm_(self.model.parameters(), 1.0) + self.optimizer.step() + total_loss += loss.item() * len(X) + return total_loss / len(loader.dataset) + + @torch.no_grad() + def evaluate(self, loader): + self.model.eval() + total_loss = 0 + for X, y in loader: + X, y = X.to(self.device), y.to(self.device) + total_loss += self.criterion(self.model(X), y).item() * len(X) + return total_loss / len(loader.dataset) + + def fit(self, train_loader, val_loader, epochs, patience=5): + best_loss = float('inf') + wait = 0 + for epoch in range(epochs): + train_loss = self.train_epoch(train_loader) + val_loss = self.evaluate(val_loader) + self.history['train_loss'].append(train_loss) + self.history['val_loss'].append(val_loss) + print(f"Epoch {epoch+1}: train={train_loss:.4f} val={val_loss:.4f}") + if val_loss < best_loss: + best_loss = val_loss + torch.save(self.model.state_dict(), 'best_model.pt') + wait = 0 + else: + wait += 1 + if wait >= patience: + print("Early stopping!") + break
+ +

2. Custom Dataset for Any Tabular Data

+
class TabularDataset(torch.utils.data.Dataset): + def __init__(self, df, target, cat_cols=None, num_cols=None): + self.target = torch.FloatTensor(df[target].values) + self.num = torch.FloatTensor(df[num_cols].values) if num_cols else None + self.cat = torch.LongTensor(df[cat_cols].values) if cat_cols else None + + def __len__(self): + return len(self.target) + + def __getitem__(self, idx): + x = {} + if self.num is not None: x['num'] = self.num[idx] + if self.cat is not None: x['cat'] = self.cat[idx] + return x, self.target[idx]
+ +

3. Mixed Precision + Gradient Accumulation

+
from torch.cuda.amp import autocast, GradScaler + +scaler = GradScaler() +accum_steps = 4 # Effective batch = batch_size Γ— 4 + +for i, (X, y) in enumerate(loader): + with autocast(): + loss = model(X.cuda(), y.cuda()) / accum_steps + scaler.scale(loss).backward() + + if (i + 1) % accum_steps == 0: + scaler.unscale_(optimizer) + torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) + scaler.step(optimizer) + scaler.update() + optimizer.zero_grad()
+ +

4. Transfer Learning

+
import torchvision.models as models + +model = models.resnet50(weights='IMAGENET1K_V2') +model.requires_grad_(False) # Freeze all +model.fc = nn.Sequential( + nn.Dropout(0.3), + nn.Linear(2048, 512), + nn.ReLU(), + nn.Linear(512, num_classes) +) + +# Discriminative LR: lower for pretrained, higher for new head +optimizer = torch.optim.AdamW([ + {'params': model.layer4.parameters(), 'lr': 1e-5}, + {'params': model.fc.parameters(), 'lr': 1e-3} +])
+ +

5. Model Save/Load Best Practices

+
# Save everything for resuming training +checkpoint = { + 'epoch': epoch, + 'model_state': model.state_dict(), + 'optimizer_state': optimizer.state_dict(), + 'scheduler_state': scheduler.state_dict(), + 'best_loss': best_loss, + 'config': config +} +torch.save(checkpoint, 'checkpoint.pt') + +# Resume training +ckpt = torch.load('checkpoint.pt', map_location=device) +model.load_state_dict(ckpt['model_state']) +optimizer.load_state_dict(ckpt['optimizer_state'])
+
`, + interview: ` +
+

🎯 PyTorch Interview Questions

+
Q1: How does autograd work?

Answer: Records ops in DAG. .backward() traverses reverse, chain rule. Graph destroyed after backward. Dynamic = rebuilt each forward.

+
Q2: Why zero_grad()?

Answer: Gradients accumulate. Without zeroing, previous batch adds to current. Intentional: enables gradient accumulation for larger effective batch.

+
Q3: .detach() vs torch.no_grad()?

Answer: detach(): single tensor, shares data. no_grad(): context manager for all ops inside, saves memory. Use no_grad() for inference.

+
Q4: How to debug vanishing gradients?

Answer: (1) Backward hooks for gradient magnitudes. (2) clip_grad_norm_. (3) TensorBoard histograms. (4) BatchNorm/LayerNorm. (5) Skip connections.

+
Q5: DataLoader num_workers?

Answer: Rule: 4 Γ— num_gpus. Too many = CPU overhead. pin_memory=True for faster transfers. Profile to find sweet spot.

+
Q6: torch.compile vs eager?

Answer: compile JITs model via TorchDynamo+Triton. 30-60% faster. One line change. The future of PyTorch performance.

+
Q7: How to save/load models?

Answer: state_dict (weights only) vs full checkpoint (weights + optimizer + epoch). Use state_dict for inference, checkpoint for resuming.

+
Q8: Mixed precision β€” how and why?

Answer: autocast(fp16 forward) + GradScaler(fp32 grads). 2-3x speedup. Minimal accuracy loss. Standard for GPU training.

+
` +}, + +"tensorflow": { + concepts: ` +
+

🧠 TensorFlow & Keras β€” Complete Guide

+ +
+
⚑ TF2 = Eager by Default + @tf.function for Speed
+
TF2 defaults to eager mode (like PyTorch). @tf.function compiles to graph for production. Keras is the official API. TF handles full lifecycle: train β†’ save β†’ serve β†’ monitor.
+
+ +

1. Three Model APIs

+ + + + + +
APIUse CaseFlexibility
SequentialLinear stackLow
FunctionalMulti-input/output, branchingMedium (recommended)
SubclassingCustom forward logicHigh
+ +

2. tf.data Pipeline

+

Chains transformations lazily. .map(), .batch(), .shuffle(), .prefetch(AUTOTUNE). Prefetching overlaps loading with GPU execution. .cache() for small datasets. .interleave() for reading multiple files. TFRecord format for large datasets.

+ +

3. Callbacks β€” Training Hooks

+ + + + + + + + +
CallbackPurpose
ModelCheckpointSave best model
EarlyStoppingStop when metric plateaus
ReduceLROnPlateauReduce LR when stuck
TensorBoardVisualize metrics
CSVLoggerLog to CSV
LambdaCallbackCustom per-epoch logic
+ +

4. GradientTape β€” Custom Training

+

Record ops β†’ compute gradients β†’ apply. Use for: GANs, RL, custom losses, gradient penalty, multi-loss weighting. Same concept as PyTorch's manual loop.

+ +

5. @tf.function β€” Production Speed

+

Trace Python β†’ TF graph. Benefits: optimized execution, XLA, export. Gotchas: Python side effects only during tracing. Use tf.print() in graphs.

+ +

6. SavedModel β€” Universal Deployment

+

model.save('path') exports architecture + weights + computation. Ready for: TF Serving (production), TF Lite (mobile), TF.js (browser). One model, any platform.

+ +

7. Keras Tuner β€” Automated Hyperparameter Search

+

Build model function β†’ Tuner searches space. Strategies: Random, Hyperband, Bayesian. Integrates with TensorBoard. Alternative to Optuna for Keras models.

+ +

8. TF vs PyTorch β€” Decision Guide

+ + + + + + + +
Choose TF WhenChoose PyTorch When
Production deployment at scaleResearch & prototyping
Mobile (TFLite mature)Hugging Face ecosystem
TPU trainingGPU research
Edge devicesCustom architectures
Browser (TF.js)Academic papers
+
`, + code: ` +
+

πŸ’» TensorFlow Project Code

+ +

1. Functional API β€” Multi-Input Model

+
import tensorflow as tf +from tensorflow import keras + +text_input = keras.Input(shape=(100,), name='text') +num_input = keras.Input(shape=(5,), name='features') + +x1 = keras.layers.Embedding(10000, 64)(text_input) +x1 = keras.layers.GlobalAveragePooling1D()(x1) +x2 = keras.layers.Dense(32, activation='relu')(num_input) + +combined = keras.layers.Concatenate()([x1, x2]) +x = keras.layers.Dense(64, activation='relu')(combined) +x = keras.layers.Dropout(0.3)(x) +output = keras.layers.Dense(1, activation='sigmoid')(x) +model = keras.Model(inputs=[text_input, num_input], outputs=output)
+ +

2. Training with Callbacks

+
callbacks = [ + keras.callbacks.ModelCheckpoint('best.keras', + monitor='val_loss', save_best_only=True), + keras.callbacks.EarlyStopping(patience=5, + restore_best_weights=True), + keras.callbacks.ReduceLROnPlateau(factor=0.5, patience=3), + keras.callbacks.TensorBoard(log_dir='./logs') +] + +model.compile(optimizer='adam', loss='binary_crossentropy', + metrics=['accuracy', keras.metrics.AUC()]) +model.fit(X_train, y_train, epochs=50, + validation_split=0.2, callbacks=callbacks)
+ +

3. Custom Training Loop (GradientTape)

+
@tf.function +def train_step(model, X, y, optimizer, loss_fn): + with tf.GradientTape() as tape: + preds = model(X, training=True) + loss = loss_fn(y, preds) + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(grads, model.trainable_variables)) + return loss
+ +

4. tf.data Pipeline

+
dataset = ( + tf.data.Dataset.from_tensor_slices((X, y)) + .shuffle(10000) + .batch(64) + .map(lambda x, y: (augment(x), y), + num_parallel_calls=tf.data.AUTOTUNE) + .prefetch(tf.data.AUTOTUNE) +)
+ +

5. Custom Callback for Experiment Logging

+
class ExperimentLogger(keras.callbacks.Callback): + def __init__(self, log_path): + self.log_path = log_path + self.logs_data = [] + + def on_epoch_end(self, epoch, logs=None): + self.logs_data.append({'epoch': epoch, **logs}) + pd.DataFrame(self.logs_data).to_csv(self.log_path, index=False) + if logs['val_loss'] > logs['loss'] * 1.5: + print(f"⚠️ Possible overfitting at epoch {epoch}")
+
`, + interview: ` +
+

🎯 TensorFlow Interview Questions

+
Q1: Sequential vs Functional vs Subclassing?

Answer: Sequential: linear. Functional: multi-I/O, branching. Subclassing: full Python control. Use Functional for most projects.

+
Q2: What does @tf.function do?

Answer: Traces Python β†’ TF graph. Faster, XLA, export. Gotcha: side effects only during tracing.

+
Q3: tf.data performance?

Answer: prefetch(AUTOTUNE) overlaps loading+training. cache() for small data. interleave() for multiple files.

+
Q4: EarlyStopping config?

Answer: monitor='val_loss', patience=5-10, restore_best_weights=True. Combine with ReduceLROnPlateau.

+
Q5: When GradientTape?

Answer: GANs, RL, custom gradients, multi-loss. When .fit() is too restrictive.

+
Q6: TF vs PyTorch?

Answer: TF: deployment (Serving, Lite, JS), mobile. PyTorch: research, HuggingFace. Both converging.

+
Q7: How to deploy TF model?

Answer: SavedModel β†’ TF Serving (REST/gRPC), TFLite (mobile), TF.js (browser). Docker + TF Serving for production.

+
` +}, + +"production": { + concepts: ` +
+

πŸ“¦ Production Python β€” Complete Engineering Guide

+ +
+
⚑ Production = Reliability + Reproducibility + Observability
+
Production code must be tested (pytest), typed (mypy), logged (structured), packaged (pyproject.toml), containerized (Docker), and monitored (metrics). The gap between notebook and production is enormous.
+
+ +

1. pytest β€” Professional Testing

+ + + + + + + + + +
Feature | Purpose | Example
fixtures | Reusable test setup | @pytest.fixture
parametrize | Many inputs, same test | @pytest.mark.parametrize
conftest.py | Shared fixtures | DB connections, mock data
monkeypatch | Override functions/env | Mock API calls
tmp_path | Temp directory | Test file I/O
markers | Tag tests | pytest -m "not slow"
coverage | Measure test coverage | pytest --cov
+ +

2. Testing ML Code

+
+
🎯 What to Test in ML
+
+ Unit: data transforms, feature engineering, loss functions.
+ Integration: full pipeline end-to-end.
+ Model: output shape, range, determinism with seed.
+ Data: schema validation, distribution shifts, missing patterns. +
+
+ +

3. Logging Best Practices

+ + + + + + + +
Level | When
DEBUG | Tensor shapes, intermediate values
INFO | Training started, epoch complete
WARNING | Unexpected but handled (fallback used)
ERROR | Model load failure, API error
CRITICAL | OOM, GPU crash
+

Never use print(). Use structured logging (JSON format) for production β€” parseable by log aggregators (ELK, Datadog).

+ +

4. FastAPI for Model Serving

+

Modern async framework. Auto-generates OpenAPI docs. Pydantic validation. Deploy with Uvicorn + Docker. Add: health checks, input validation, error handling, rate limiting, request logging.

+ +

5. Docker for ML

+

Containerize everything: Python, CUDA, dependencies. Multi-stage builds: builder (install) β†’ runtime (slim). Pin versions. NVIDIA Container Toolkit for GPU. docker compose for multi-service (API + Redis + DB).

+ +

6. pyproject.toml β€” Modern Packaging

+

Replaces setup.py/cfg. Project metadata, dependencies, build system, tool configs (pytest, mypy, ruff). [project.optional-dependencies] for dev/test extras. pip install -e ".[dev]" for editable installs.

+ +

7. Configuration Management

+ + + + + +
Tool | Best For | Key Feature
Hydra | ML experiments | YAML, CLI overrides, multi-run
Pydantic Settings | App config | Env var loading, validation
python-dotenv | Simple projects | .env file loading
+ +

8. CI/CD for ML

+

GitHub Actions: lint (ruff) β†’ type check (mypy) β†’ test (pytest) β†’ build (Docker) β†’ deploy. Add model validation gate: new model must beat baseline on test metrics before deployment.

+ +

9. Code Quality Tools

+ + + + + + + +
Tool | Purpose
ruff | Fast linter + formatter (replaces black, isort, flake8)
mypy | Static type checking
pre-commit | Git hooks for auto-formatting
pytest-cov | Test coverage
bandit | Security linting
+ +

10. MLOps β€” Model Lifecycle

+ + + + + + + +
Tool | Purpose
MLflow | Experiment tracking, model registry
DVC | Data versioning (like Git for data)
Weights & Biases | Experiment tracking, visualization
Evidently | Data drift & model monitoring
Great Expectations | Data validation
+ +

11. Database for ML Projects

+ + + + + + + +
DB | Use Case | Python Library
SQLite | Local, small data, prototyping | sqlite3 (built-in)
PostgreSQL | Production, ACID, JSON | psycopg2, SQLAlchemy
Redis | Caching, queues, sessions | redis-py
MongoDB | Flexible schema, documents | pymongo
Pinecone/Weaviate | Vector search (embeddings) | Official SDKs
+
`, + code: ` +
+

πŸ’» Production Python Project Code

+ +

1. pytest β€” Complete ML Testing

+
import pytest +import numpy as np + +# conftest.py β€” shared fixtures +@pytest.fixture +def sample_data(): + np.random.seed(42) + X = np.random.randn(100, 10) + y = np.random.randint(0, 2, 100) + return X, y + +@pytest.fixture +def trained_model(sample_data): + X, y = sample_data + model = RandomForestClassifier(n_estimators=10) + model.fit(X, y) + return model + +# Test multiple models with one function +@pytest.mark.parametrize("model_cls", [ + LogisticRegression, RandomForestClassifier, GradientBoostingClassifier +]) +def test_model_output(model_cls, sample_data): + X, y = sample_data + model = model_cls() + model.fit(X, y) + preds = model.predict(X) + assert preds.shape == y.shape + assert set(np.unique(preds)).issubset({0, 1}) + +# Test data pipeline +def test_pipeline_no_leakage(sample_data, pipeline): + X, y = sample_data + scores = cross_val_score(pipeline, X, y, cv=3) + assert all(s >= 0 and s <= 1 for s in scores)
+ +

2. Structured Logging

+
import logging, json, sys + +class JSONFormatter(logging.Formatter): + def format(self, record): + log = { + 'timestamp': self.formatTime(record), + 'level': record.levelname, + 'module': record.module, + 'message': record.getMessage() + } + if record.exc_info: + log['exception'] = self.formatException(record.exc_info) + return json.dumps(log) + +def setup_logging(level=logging.INFO): + handler = logging.StreamHandler(sys.stdout) + handler.setFormatter(JSONFormatter()) + logging.root.handlers = [handler] + logging.root.setLevel(level) + +logger = logging.getLogger(__name__) +logger.info("Training started", extra={'model': 'xgb'})
+ +

3. FastAPI β€” Complete ML API

+
from fastapi import FastAPI, HTTPException +from pydantic import BaseModel, Field +import joblib, numpy as np + +app = FastAPI(title="ML Prediction API") +model = None + +@app.on_event("startup") +def load_model(): + global model + model = joblib.load("models/pipeline.pkl") + +class PredictRequest(BaseModel): + features: list[float] = Field(..., min_length=1) + +class PredictResponse(BaseModel): + prediction: int + probability: float + model_version: str + +@app.post("/predict", response_model=PredictResponse) +async def predict(req: PredictRequest): + try: + X = np.array(req.features).reshape(1, -1) + pred = model.predict(X)[0] + proba = model.predict_proba(X)[0].max() + return PredictResponse( + prediction=int(pred), probability=float(proba), + model_version="v2.1" + ) + except Exception as e: + raise HTTPException(500, detail=str(e)) + +@app.get("/health") +async def health(): + return {"status": "healthy", "model_loaded": model is not None}
+ +

4. Dockerfile for ML

+
# Multi-stage build +FROM python:3.11-slim AS builder +COPY requirements.txt . +RUN pip install --no-cache-dir --target=/deps -r requirements.txt + +FROM python:3.11-slim +COPY --from=builder /deps /usr/local/lib/python3.11/site-packages +COPY src/ /app/src/ +COPY models/ /app/models/ +WORKDIR /app +EXPOSE 8000 +HEALTHCHECK CMD curl -f http://localhost:8000/health || exit 1 +CMD ["uvicorn", "src.api:app", "--host", "0.0.0.0", "--port", "8000"]
+ +

5. Makefile for Project Commands

+
# Makefile β€” run from project root +.PHONY: install test lint train serve + +install: + pip install -e ".[dev]" + +test: + pytest tests/ -v --cov=src --cov-report=term-missing + +lint: + ruff check src/ tests/ + mypy src/ + +train: + python -m src.training.train --config configs/default.yaml + +serve: + uvicorn src.api:app --reload --port 8000
+ +

6. MLflow Experiment Tracking

+
import mlflow + +mlflow.set_experiment("customer_churn") +with mlflow.start_run(): + mlflow.log_params({"model": "xgb", "lr": 0.01}) + model.fit(X_train, y_train) + mlflow.log_metrics({"f1": f1, "auc": auc_score}) + mlflow.sklearn.log_model(pipeline, "model")
+
`, + interview: ` +
+

🎯 Production Python Interview Questions

+
Q1: How to test ML code?

Answer: Unit: transforms, features. Integration: full pipeline. Model: shape, range, determinism. Data: schema, distributions. Use pytest fixtures.

+
Q2: print() vs logging?

Answer: Logging: levels, file output, structured (JSON), zero cost when disabled, thread-safe. Print: none. Production = logging.

+
Q3: How to serve ML model?

Answer: FastAPI + Docker. Load model at startup. Add health checks, validation, error handling, logging. Async for throughput.

+
Q4: pyproject.toml vs setup.py?

Answer: pyproject.toml: modern standard, all tools in one file. Pin deps. Use optional deps for dev/test. pip install -e ".[dev]".

+
Q5: ML experiment configs?

Answer: Hydra: YAML + CLI overrides + multi-run sweeps. Version control configs. Never hardcode hyperparams.

+
Q6: CI/CD for ML?

Answer: lint β†’ type-check β†’ test β†’ build β†’ deploy. Model validation gate: must beat baseline. GitHub Actions + Docker.

+
Q7: How to handle model versioning?

Answer: MLflow model registry. DVC for data. Git for code. timestamp + metrics in model filename. A/B testing for rollout.

+
Q8: What is data drift?

Answer: Input distribution changes post-deployment. Detect: Evidently, statistical tests. Monitor: feature distributions, prediction distributions. Retrain trigger.

+
` +}, + +"optimization": { + concepts: ` +
+

⚑ Performance & Optimization β€” Complete Guide

+ +
+
⚑ The Optimization Hierarchy
+
1. Algorithm (O(n²)→O(n log n)) > 2. Data structures (list→set) > 3. Vectorization (NumPy) > 4. Compilation (Numba/Cython) > 5. Parallelization (multiprocessing) > 6. Hardware (GPU). Always start from the top.
+
+ +

1. Profiling β€” Measure First

+ + + + + + + + +
Tool | Type | When | Overhead
cProfile | Function-level | Find slow functions | ~2x
line_profiler | Line-by-line | Optimize hot function | Higher
Py-Spy | Sampling | Production profiling | Near zero
tracemalloc | Memory | Find leaks | Low
memory_profiler | Line memory | Memory per line | High
scalene | CPU+Memory+GPU | Comprehensive | Low
+ +

2. The GIL β€” What Every Python Dev Must Know

+
+
πŸ”’ Global Interpreter Lock
+
GIL prevents true multi-threading for CPU-bound Python. BUT: NumPy, Pandas, scikit-learn release the GIL during C operations. Python 3.13: experimental free-threaded CPython (no-GIL).
+
+ + + + + + +
Task Type | Solution | Why
I/O-bound | asyncio / threading | GIL released during I/O
CPU-bound Python | multiprocessing | Separate processes, separate GIL
CPU-bound NumPy | threading OK | NumPy releases GIL
Many tasks | concurrent.futures | Simple Pool interface
+ +

3. Numba β€” JIT Compilation

+

@numba.jit(nopython=True): compile to machine code. 10-100x speedup for loops. Supports NumPy, math. @numba.vectorize: custom ufuncs. @cuda.jit: GPU kernels. Best for: tight loops that can't be vectorized.

+ +

4. Dask β€” Parallel Computing

+

Pandas/NumPy API for data bigger than memory. dask.dataframe, dask.array, dask.delayed. Lazy execution. Task graph scheduler. Scales from laptop to cluster. Alternative: Polars for single-machine parallel.

+ +

5. Ray β€” Distributed ML

+

General-purpose distributed framework. Ray Tune (hyperparameter tuning), Ray Serve (model serving), Ray Data. Easier than Dask for ML. Used by OpenAI, Uber.

+ +

6. Memory Optimization

+ + +

7. Caching Strategies

+ + + + + + + +
Tool | Scope | Use Case
@functools.lru_cache | In-memory, function | Expensive computations
@functools.cache | Unbounded cache | Pure functions
joblib.Memory | Disk cache | Data processing pipelines
Redis | External cache | Multi-process, API responses
diskcache | Pure Python disk | Simple persistent cache
+ +

8. Python 3.12-3.13 Performance

+

3.12: 5-15% faster, better errors, per-interpreter GIL. 3.13: Free-threaded (no-GIL experimental), JIT compiler (experimental). The future of Python performance is exciting.

+ +

9. Common Performance Anti-Patterns

+ + + + + + + + +
Anti-Pattern | Fix | Speedup
for row in df.iterrows() | Vectorized ops | 100-1000x
s += "text" in loop | ''.join(parts) | 100x
x in big_list | x in big_set | 1000x
Python list of floats | NumPy array | 50-100x
Global imports in function | Import at top | Variable
Not using built-ins | sum(), min() | 5-10x
+
`, + code: ` +
+

πŸ’» Performance Code Examples

+ +

1. Profiling Workflow

+
import cProfile, pstats + +# Profile and find bottlenecks +with cProfile.Profile() as pr: + result = expensive_pipeline(data) + +stats = pstats.Stats(pr) +stats.sort_stats('cumulative') +stats.print_stats(10) # Top 10 slow functions + +# Memory profiling +import tracemalloc +tracemalloc.start() +# ... process data ... +snapshot = tracemalloc.take_snapshot() +for stat in snapshot.statistics('filename')[:5]: + print(stat)
+ +

2. Numba JIT

+
import numba +import numpy as np + +@numba.jit(nopython=True) +def pairwise_distance(X): + n = X.shape[0] + D = np.empty((n, n)) + for i in range(n): + for j in range(i+1, n): + d = 0.0 + for k in range(X.shape[1]): + d += (X[i,k] - X[j,k]) ** 2 + D[i,j] = D[j,i] = d ** 0.5 + return D +# 100x faster than pure Python!
+ +

3. concurrent.futures β€” Parallel Processing

+
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor + +# CPU-bound: processes +with ProcessPoolExecutor(max_workers=8) as ex: + results = list(ex.map(process_chunk, data_chunks)) + +# I/O-bound: threads +with ThreadPoolExecutor(max_workers=32) as ex: + results = list(ex.map(fetch_url, urls))
+ +

4. Dask for Large Data

+
import dask.dataframe as dd + +# Read 100GB of CSVs β€” lazy! +ddf = dd.read_csv('data/*.csv') + +# Same Pandas API β€” but parallel +result = ( + ddf.groupby('category') + .agg({'revenue': 'sum', 'qty': 'mean'}) + .compute() # Only here does it execute +)
+ +

5. functools.lru_cache β€” Memoization

+
from functools import lru_cache + +@lru_cache(maxsize=1024) +def expensive_feature(customer_id: int) -> dict: + # DB query, computation, etc. + return compute_features(customer_id) + +# First call: computes. Second call: instant from cache +print(expensive_feature.cache_info()) # hits, misses, size
+ +

6. __slots__ for Memory

+
class Point: + __slots__ = ('x', 'y', 'z') + def __init__(self, x, y, z): + self.x, self.y, self.z = x, y, z + +# 1M instances: ~60MB vs ~160MB without __slots__ +points = [Point(i, i*2, i*3) for i in range(1_000_000)]
+ +

7. String Performance

+
# ❌ O(nΒ²) β€” creates new string each iteration +result = "" +for word in words: + result += word + " " + +# βœ… O(n) β€” single allocation at end +result = " ".join(words)
+
`, + interview: ` +
+

🎯 Performance Interview Questions

+
Q1: Why the GIL?

Answer: Simplifies reference counting. Makes single-threaded faster. Easier C extensions. Python 3.13 has experimental no-GIL mode.

+
Q2: Optimize nested loop?

Answer: (1) NumPy vectorize. (2) Numba JIT. (3) Cython. (4) multiprocessing if independent.

+
Q3: Threading vs multiprocessing?

Answer: Threading: I/O-bound (shared memory). Multiprocessing: CPU-bound (bypasses GIL). Downloads→threads. Matrix math→processes.

+
Q4: What is Numba?

Answer: JIT compiler: Python→machine code via LLVM. @jit(nopython=True). 10-100x for NumPy loops. No Pandas/strings.

+
Q5: How to profile Python?

Answer: cProfile: functions. line_profiler: lines. Py-Spy: production. tracemalloc: memory. scalene: all-in-one. Profile FIRST, optimize second.

+
Q6: Dask vs Ray vs Spark?

Answer: Dask: Pandas API, Python-native. Ray: ML-focused. Spark: JVM, TB+ data. Python ML: Dask/Ray. Big data ETL: Spark.

+
Q7: Top 3 Python performance tips?

Answer: (1) Use sets not lists for lookups. (2) NumPy not Python loops. (3) Generator expressions for memory. Bonus: lru_cache for expensive functions.

+
Q8: How does lru_cache work?

Answer: Hash-based memoization. Args must be hashable. maxsize=None for unlimited. cache_info() shows hits/misses. Perfect for pure functions.

+
` +} +}; + +// Render the dashboard grid: builds one clickable card per entry in the +// top-level modules array and injects the markup into #modulesGrid. +// NOTE(review): template HTML below was damaged by extraction - tags stripped; confirm against original. +function renderDashboard() { + const grid = document.getElementById('modulesGrid'); + grid.innerHTML = modules.map(module => ` +
+
${module.icon}
+

${module.title}

+

${module.description}

+ ${module.category} +
+ `).join(''); +} + +// Show specific module +// Hides the dashboard and injects one module view (header plus the three +// content tabs: concepts / code / interview) into #modulesContainer. +// NOTE(review): assumes every module id has a matching MODULE_CONTENT entry - confirm before adding modules. +function showModule(moduleId) { + const module = modules.find(m => m.id === moduleId); + const content = MODULE_CONTENT[moduleId]; + + document.getElementById('dashboard').classList.remove('active'); + + const moduleHTML = ` +
+ +
+

${module.icon} ${module.title}

+

${module.description}

+
+ +
+ + + +
+ +
${content.concepts}
+
${content.code}
+
${content.interview}
+
+ `; + + document.getElementById('modulesContainer').innerHTML = moduleHTML; +} + +// Switch tabs +// Highlights the selected .tab-btn (via the click event when available, +// otherwise by matching tabName to the fixed button order) and activates +// the tab pane whose id is "<moduleId>-<tabName>". +function switchTab(moduleId, tabName, e) { + const moduleEl = document.getElementById(`module-${moduleId}`); + + // Update tab buttons + moduleEl.querySelectorAll('.tab-btn').forEach(btn => btn.classList.remove('active')); + if (e && e.target) { + e.target.classList.add('active'); + } else { + // Fallback: find the button by tab name + const tabNames = ['concepts', 'code', 'interview']; + const idx = tabNames.indexOf(tabName); + if (idx !== -1) moduleEl.querySelectorAll('.tab-btn')[idx]?.classList.add('active'); + } + + // Update tab content + moduleEl.querySelectorAll('.tab').forEach(tab => tab.classList.remove('active')); + document.getElementById(`${moduleId}-${tabName}`).classList.add('active'); +} + +// Back to dashboard +// Removes every open module view from the DOM and re-activates the dashboard. +function backToDashboard() { + document.querySelectorAll('.module').forEach(m => m.remove()); + document.getElementById('dashboard').classList.add('active'); +} + +// Initialize +// Render the dashboard once the DOM is ready. + +document.addEventListener('DOMContentLoaded', renderDashboard);