diff --git "a/Python/app.js" "b/Python/app.js"
new file mode 100644
--- /dev/null
+++ "b/Python/app.js"
@@ -0,0 +1,1876 @@
+// Registry of the course's learning modules. Each entry renders one card in
+// the UI; `id` is the lookup key into the MODULE_CONTENT map declared below
+// (so every id here must have a matching MODULE_CONTENT entry).
+// NOTE(review): the emoji `icon` strings appear mojibake-encoded in this
+// diff (UTF-8 bytes mis-decoded); preserved byte-for-byte — confirm encoding
+// against the original file before shipping.
+const modules = [
+  {
+    id: "python-fundamentals",
+    title: "Python Fundamentals for DS",
+    icon: "๐Ÿ",
+    category: "Foundations",
+    description: "Data structures, comprehensions, file I/O, virtual environments"
+  },
+  {
+    id: "numpy",
+    title: "NumPy & Scientific Computing",
+    icon: "๐Ÿ”ข",
+    category: "Scientific",
+    description: "ndarrays, broadcasting, vectorization, linear algebra"
+  },
+  {
+    id: "pandas",
+    title: "Pandas & Data Manipulation",
+    icon: "๐Ÿผ",
+    category: "Data Wrangling",
+    description: "DataFrames, groupby, pivot, time series, merging"
+  },
+  {
+    id: "visualization",
+    title: "Data Visualization",
+    icon: "๐Ÿ“Š",
+    category: "Visualization",
+    description: "Matplotlib, Seaborn, Plotly โ€” from basics to publication-ready"
+  },
+  {
+    id: "advanced-python",
+    title: "Advanced Python",
+    icon: "๐ŸŽฏ",
+    category: "Advanced",
+    description: "OOP, decorators, async, multiprocessing, type hints"
+  },
+  {
+    id: "sklearn",
+    title: "Python for ML (Scikit-learn)",
+    icon: "๐Ÿค–",
+    category: "Machine Learning",
+    description: "Pipelines, transformers, cross-validation, hyperparameter tuning"
+  },
+  {
+    id: "pytorch",
+    title: "Deep Learning with PyTorch",
+    icon: "๐Ÿ”ฅ",
+    category: "Deep Learning",
+    description: "Tensors, autograd, nn.Module, training loops, transfer learning"
+  },
+  {
+    id: "tensorflow",
+    title: "TensorFlow & Keras",
+    icon: "๐Ÿง ",
+    category: "Deep Learning",
+    description: "Sequential/Functional API, callbacks, TensorBoard, deployment"
+  },
+  {
+    id: "production",
+    title: "Production Python",
+    icon: "๐Ÿ“ฆ",
+    category: "Engineering",
+    description: "Testing, packaging, logging, FastAPI for model serving"
+  },
+  {
+    id: "optimization",
+    title: "Performance & Optimization",
+    icon: "โšก",
+    category: "Optimization",
+    description: "Profiling, Numba, Cython, memory optimization, Dask"
+  }
+];
+
+// Per-module lesson content (HTML template strings) keyed by module id.
+// Declaration continues on the following source line.
+const
MODULE_CONTENT = { + "python-fundamentals": { + concepts: ` +
+

Python Data Structures for DS

+ + + + + + + + + +
TypeMutableOrderedHashableUse Case
listโœ“โœ“โœ—Sequential data, time series, feature lists
tupleโœ—โœ“โœ“Fixed records, dict keys, DataFrame rows
dictโœ“โœ“ (3.7+)โœ—Lookup tables, JSON, config, caches
setโœ“โœ—โœ—Unique values, membership testing O(1)
frozensetโœ—โœ—โœ“Immutable set, usable as dict keys
dequeโœ“โœ“โœ—O(1) append/pop both ends, sliding windows
+ +

๐Ÿง  Python Memory Model โ€” What No One Teaches You

+
+
โšก Everything Is An Object
+
+ In Python, every value is an object on the heap. Variables are just references (pointers) to objects. When you write a = [1, 2, 3], the list lives on the heap; a is a name that points to it. This is why b = a makes both point to the same list โ€” no copy is made. +
+
+

Reference Counting: Python uses reference counting + cyclic garbage collector. Each object tracks how many names point to it. When count hits 0, memory is freed immediately. This is why del doesn't always free memory โ€” it just decrements the reference count.

+

Integer Interning: Python caches integers from -5 to 256 and short strings. So a = 100; b = 100; a is b is True, but a = 1000; b = 1000; a is b may be False. Never use is for value comparison โ€” always use ==.

+ +

collections Module โ€” The Power Tools

+ + + + + + + + +
ClassPurposeWhy It Matters in DS
defaultdictDict with default factoryGroup data without KeyError: defaultdict(list)
CounterCount hashable objectsLabel distribution: Counter(y_train)
namedtupleLightweight immutable classReturn multiple values with names, not indices
OrderedDictDict remembering insertion orderLegacy (dicts are ordered in 3.7+), but useful for move_to_end()
dequeDouble-ended queueSliding window computations, BFS algorithms
ChainMapStack multiple dictsLayer config: defaults โ†’ env โ†’ CLI overrides
+ +

itertools โ€” Memory-Efficient Data Pipelines

+
+
๐Ÿ”„ Lazy Evaluation Is King
+
+ itertools functions return iterators, not lists. They consume O(1) memory regardless of input size. This matters when processing millions of records. +
+
+ + + + + + + + +
FunctionWhat It DoesDS Use Case
chain()Concatenate iterablesMerge multiple data files lazily
islice()Slice any iteratorTake first N records from generator
groupby()Group consecutive elementsProcess sorted log entries by date
product()Cartesian productGenerate hyperparameter grid
combinations()All r-length combosFeature interaction pairs
starmap()map() with unpacked argsApply function to paired data
+ +

pathlib โ€” Modern File Handling

+

Stop using os.path.join(). Use pathlib.Path โ€” it's object-oriented, cross-platform, and reads like English:

+ + +

Error Handling Patterns for Data Pipelines

+
+
โš ๏ธ Never Do This
+ Bare except: catches SystemExit and KeyboardInterrupt. Always catch specific exceptions. In DS pipelines, catch ValueError (bad data), FileNotFoundError (missing files), KeyError (missing columns). +
+

LBYL vs EAFP: Python prefers "Easier to Ask Forgiveness than Permission" (EAFP). Use try/except instead of checking conditions first. It's faster when exceptions are rare (which they usually are).

+ +

Virtual Environments โ€” Dependency Isolation

+ + + + + + +
ToolBest ForCreateKey Feature
venvSimple projectspython -m venv envBuilt-in, lightweight
condaDS/ML (C dependencies)conda create -n myenv python=3.11Handles non-Python deps (CUDA, MKL)
poetryModern packagingpoetry initLock files, deterministic builds
uvSpeed (Rust-based)uv venv10-100x faster than pip
+
+ `, + code: ` +
+

๐Ÿ’ป Essential Code Examples

+ +

collections In Action

+
+from collections import defaultdict, Counter, namedtuple, deque + +# defaultdict โ€” Group samples by label (no KeyError!) +samples_by_label = defaultdict(list) +for feature, label in zip(features, labels): + samples_by_label[label].append(feature) +# {'cat': [f1, f3], 'dog': [f2, f4]} โ€” no if/else needed + +# Counter โ€” Class distribution analysis +y_train = [0, 1, 1, 0, 1, 2, 0, 1] +dist = Counter(y_train) +print(dist) # Counter({1: 4, 0: 3, 2: 1}) +print(dist.most_common(2)) # [(1, 4), (0, 3)] + +# namedtuple โ€” Return multiple values with names +ModelResult = namedtuple('ModelResult', ['accuracy', 'precision', 'recall', 'f1']) +result = ModelResult(accuracy=0.95, precision=0.93, recall=0.91, f1=0.92) +print(result.accuracy) # 0.95 โ€” much clearer than result[0] + +# deque โ€” Sliding window for streaming data +window = deque(maxlen=5) +for value in data_stream: + window.append(value) + moving_avg = sum(window) / len(window) +
+ +

itertools for Data Pipelines

+
+from itertools import chain, islice, product, combinations + +# chain โ€” Merge multiple data files lazily (no memory explosion) +def read_csv_lines(filepath): + with open(filepath) as f: +next(f) # skip header +yield from f + +all_data = chain( + read_csv_lines('jan.csv'), + read_csv_lines('feb.csv'), + read_csv_lines('mar.csv') +) +# Processes millions of lines with O(1) memory! + +# product โ€” Generate hyperparameter grid +learning_rates = [0.001, 0.01, 0.1] +batch_sizes = [16, 32, 64] +dropouts = [0.1, 0.3, 0.5] +for lr, bs, do in product(learning_rates, batch_sizes, dropouts): + train_model(lr=lr, batch_size=bs, dropout=do) +# 27 combinations without nested loops + +# combinations โ€” Feature interaction pairs +features = ['age', 'income', 'score', 'tenure'] +for f1, f2 in combinations(features, 2): + df[f'{f1}_x_{f2}'] = df[f1] * df[f2] +# Creates: age_x_income, age_x_score, ... (6 pairs) +
+ +

pathlib โ€” Modern File Management

+
+from pathlib import Path + +# Build paths naturally (cross-platform) +data_dir = Path('data') / 'processed' / 'v2' +data_dir.mkdir(parents=True, exist_ok=True) + +# Find all CSV files recursively +csv_files = list(data_dir.glob('**/*.csv')) +print(f"Found {len(csv_files)} CSV files") + +# Parse file names without string hacking +for f in csv_files: + print(f"Name: {f.stem}, Extension: {f.suffix}, Parent: {f.parent}") + +# Read/write without open() +config = Path('config.json').read_text() +Path('output.txt').write_text('Results: 95.2% accuracy') +
+ +

Advanced Comprehensions & Unpacking

+
+# Nested comprehension โ€” Flatten list of lists +nested = [[1, 2], [3, 4], [5, 6]] +flat = [x for sublist in nested for x in sublist] +# [1, 2, 3, 4, 5, 6] + +# Dict comprehension โ€” Invert a mapping +label_to_id = {'cat': 0, 'dog': 1, 'bird': 2} +id_to_label = {v: k for k, v in label_to_id.items()} + +# Set comprehension โ€” Unique words from documents +docs = ["hello world", "world of python"] +vocab = {word for doc in docs for word in doc.split()} + +# Walrus operator (:=) โ€” Assign + use in expression (3.8+) +if (n := len(data)) > 1000: + print(f"Large dataset: {n} samples") + +# Extended unpacking โ€” Split data elegantly +first, *middle, last = sorted(scores) +print(f"Min: {first}, Max: {last}, Middle: {middle}") +
+ +

Robust Error Handling for Pipelines

+
+import logging +logger = logging.getLogger(__name__) + +def load_and_validate(filepath): + """Production-grade data loading with proper error handling.""" + try: +df = pd.read_csv(filepath) + except FileNotFoundError: +logger.error(f"File not found: {filepath}") +raise + except pd.errors.EmptyDataError: +logger.warning(f"Empty file: {filepath}") +return pd.DataFrame() + except pd.errors.ParserError as e: +logger.error(f"Parse error in {filepath}: {e}") +raise ValueError(f"Corrupted CSV: {filepath}") from e + + # Validate required columns + required = {'id', 'target', 'timestamp'} + missing = required - set(df.columns) + if missing: +raise KeyError(f"Missing columns: {missing}") + + return df +
+
+ `, + interview: ` +
+

๐ŸŽฏ Interview Questions

+ +
+ Q1: What's the difference between a list and a tuple? When would you use each in DS? +

Answer: Lists are mutable, tuples immutable. But the deeper answer: tuples are hashable (can be dict keys), use less memory (no over-allocation), and signal intent ("this shouldn't change"). Use tuples for (lat, lon) pairs, function return values, dict keys for caching. Use lists for feature collections that grow.

+
+ +
+ Q2: How does Python's GIL affect data science workflows? +

Answer: The GIL prevents true multi-threading for CPU-bound tasks. But here's what most people miss: NumPy, Pandas, and scikit-learn release the GIL during C-level computations. So vectorized operations ARE parallel at the C level. For pure Python CPU work, use multiprocessing. For I/O (API calls, file reads), threading works fine because the GIL is released during I/O waits.

+
+ +
+ Q3: Explain the difference between is and ==. Why does this matter? +

Answer: == checks value equality (__eq__). is checks identity (same memory address). Python interns small integers (-5 to 256) and some strings, so 300 is 300 may be False. Always use == for values. Only use is for None checks: if x is None.

+
+ +
+ Q4: How would you handle a 10GB CSV that doesn't fit in memory? +

Answer: 5 strategies, from simplest to most powerful: (1) pd.read_csv(chunksize=50000) โ€” process in batches, (2) usecols=['needed_cols'] โ€” load only what you need, (3) dtype={'col': 'int32'} โ€” use smaller types, (4) Dask โ€” lazy Pandas-like API, (5) DuckDB โ€” SQL on CSV files with zero memory overhead.

+
+ +
+ Q5: What's the time complexity of dict lookup vs list search? Why? +

Answer: Dict: O(1) average via hash tables (Python's dict uses open addressing). List: O(n) linear scan. Internally, dict hashes the key to compute a slot index, then handles collisions via probing. Sets use the same mechanism. This is why x in my_set is fast but x in my_list is slow.

+
+ +
+ Q6: Explain shallow vs deep copy. Give a real DS scenario where this matters. +

Answer: copy.copy() copies outer container but shares inner objects. copy.deepcopy() recursively copies everything. Real scenario: You have a list of dicts (config per experiment). Shallow copy means modifying one experiment's config changes all of them. Deep copy gives independent configs. Pandas .copy() is deep by default โ€” but df2 = df is NOT a copy at all.

+
+ +
+ Q7: What is a defaultdict and when would you use it over a regular dict? +

Answer: defaultdict(factory) auto-creates default values for missing keys. Use defaultdict(list) to group items without if key not in dict checks. Use defaultdict(int) to count. It's cleaner and ~20% faster than dict.setdefault() for grouping operations in data processing.

+
+ +
+ Q8: What are generators and why are they critical for large-scale data processing? +

Answer: Generators yield values one at a time using yield, consuming O(1) memory regardless of data size. A list of 1 billion items = ~8GB RAM. A generator of 1 billion items = ~100 bytes. Critical for: reading large files, streaming data, batch training. yield from delegates to sub-generators.

+
+ +
+ Q9: How would you remove duplicates from a list while preserving order? +

Answer: list(dict.fromkeys(my_list)) โ€” uses dict's insertion-order guarantee (3.7+), runs in O(n). Old approach: seen = set(); [x for x in lst if not (x in seen or seen.add(x))]. For DataFrames: df.drop_duplicates(subset=['key_col']).

+
+ +
+ Q10: Explain Python's garbage collection mechanism. +

Answer: Two mechanisms: (1) Reference counting โ€” each object has a count; freed when count hits 0. Immediate cleanup. (2) Cyclic garbage collector โ€” detects reference cycles (A โ†’ B โ†’ A) that refcount can't handle. Runs periodically on generations (gen0, gen1, gen2). You can force it with gc.collect() โ€” useful after deleting large ML models.

+
+ +
+ Q11: What's the difference between __str__ and __repr__? +

Answer: __str__ is for end users (readable), __repr__ is for developers (unambiguous, ideally eval-able). If only one is defined, implement __repr__ โ€” Python falls back to it for str() too. In ML: __repr__ should show model params: LinearRegression(lr=0.01, reg=l2).

+
+ +
+ Q12: How does *args and **kwargs help in ML code? +

Answer: They enable flexible function signatures. *args: variable positional args (multiple datasets). **kwargs: variable keyword args (hyperparameters). Essential for: wrapper functions, decorators, scikit-learn's set_params(**params), and model.fit(X, y, **fit_params).

+
+ +
+ Q13: What are f-strings and why are they preferred over .format() and %? +

Answer: f-strings (3.6+) are fastest, most readable formatting. They support expressions: f"{accuracy:.2%}" โ†’ "95.23%", f"{x=}" (3.8+) โ†’ "x=42" for debugging. .format() is slower and more verbose. % formatting is legacy C-style. Always use f-strings in modern Python.

+
+ +
+ Q14: Explain the LEGB scope rule. +

Answer: Python resolves names in order: Local โ†’ Enclosing function โ†’ Global โ†’ Built-in. This is why you can accidentally shadow built-ins: list = [1,2] breaks list(). Use nonlocal to modify enclosing scope, global for module scope (but avoid globals in production code).

+
+ +
+ Q15: What's the difference between append() and extend()? +

Answer: append(x) adds x as a single element. extend(iterable) unpacks and adds each element. [1,2].append([3,4]) โ†’ [1,2,[3,4]]. [1,2].extend([3,4]) โ†’ [1,2,3,4]. Use extend() when merging feature lists; append() when adding one item to results.

+
+
+ ` + }, + + "numpy": { + concepts: ` +
+

NumPy ndarray Fundamentals

+ +

๐Ÿง  Why NumPy Is 50-100x Faster Than Python Lists

+ + + + + + + +
FeaturePython ListNumPy ndarray
StorageArray of pointers to objects scattered in memoryContiguous block of raw typed data
TypeEach element can be different typeHomogeneous โ€” all elements same dtype
OperationsPython loop (bytecode interpretation)Compiled C/Fortran loops
Memory~28 bytes per int + pointer overhead8 bytes per int64 (no overhead)
SIMDNot possibleUses CPU vector instructions (SSE/AVX)
+ +

Memory Layout: C-Order vs Fortran-Order

+
+
โšก Performance-Critical Knowledge
+
+ C-order (row-major): Rows stored contiguously. arr[0,0], arr[0,1], arr[0,2], arr[1,0]...
+ Fortran-order (col-major): Columns stored contiguously. arr[0,0], arr[1,0], arr[2,0], arr[0,1]...
+ NumPy defaults to C-order. Iterating along the last axis is fastest (cache-friendly). Fortran-order is preferred when interfacing with LAPACK/BLAS (used internally by NumPy's linear algebra). +
+
+ +

Strides: The Secret Behind Views

+

Every ndarray has a strides tuple โ€” bytes to jump in each dimension. For a (3,4) float64 array: strides = (32, 8) means jump 32 bytes for next row, 8 bytes for next column. Slicing creates views (no copy) by adjusting strides. arr[::2] doubles the row stride.

+ +

Broadcasting Rules โ€” The Complete Picture

+
+
๐ŸŽฏ Broadcasting Rules (Right to Left)
+
+ Two arrays are compatible when, for each trailing dimension: (1) Dimensions are equal, OR (2) One of them is 1.
+ Example: (5, 3, 1) + (1, 4) โ†’ shape (5, 3, 4). The (1,) dims are "stretched" virtually โ€” no memory is copied. +
+
+ +

Key dtype Choices for DS

+ + + + + + + +
dtypeBytesRangeWhen to Use
float324ยฑ3.4e38Deep learning (GPU prefers this), 50% less memory
float648ยฑ1.8e308Default. Scientific computing, high-precision stats
int324ยฑ2.1 billionIndices, counts, most integer data
bool1True/FalseMasks for filtering
category (Pandas)VariesFinite setRepeated strings โ†’ 90% memory savings
+ +

np.einsum โ€” Einstein Summation (Power Tool)

+

np.einsum can express any tensor operation in one call: matrix multiply, trace, transpose, batch ops. Often faster than chaining NumPy functions because it avoids intermediate arrays.

+ +

Linear Algebra for ML

+ +
+ `, + code: ` +
+

๐Ÿ’ป NumPy Code Examples

+ +

Array Creation & Inspection

+
+import numpy as np + +# Create with specific dtypes for memory efficiency +X = np.random.randn(1000, 10).astype(np.float32) # 40KB vs 80KB +y = np.random.randint(0, 2, size=1000, dtype=np.int8) # 1KB vs 8KB + +# Inspect memory layout +print(f"Shape: {X.shape}") # (1000, 10) +print(f"Strides: {X.strides}") # (40, 4) bytes +print(f"C-contiguous: {X.flags['C_CONTIGUOUS']}") # True +print(f"Memory: {X.nbytes / 1024:.1f} KB") # 39.1 KB +
+ +

Broadcasting for Feature Normalization

+
+# Normalize each feature (mean=0, std=1) +X = np.random.randn(1000, 5) # 1000 samples, 5 features + +mean = X.mean(axis=0) # shape (5,) +std = X.std(axis=0) # shape (5,) + +X_normalized = (X - mean) / std # Broadcasting! (1000,5) - (5,) works + +# Min-Max scaling to [0, 1] +X_min = X.min(axis=0) +X_max = X.max(axis=0) +X_scaled = (X - X_min) / (X_max - X_min + 1e-8) # epsilon avoids /0 +
+ +

Advanced Indexing & Boolean Masking

+
+# Boolean mask โ€” filter outliers (3 sigma rule) +data = np.random.randn(10000) +mask = np.abs(data) < 3 # True where within 3 std devs +clean = data[mask] # Only non-outlier values +print(f"Removed {len(data) - len(clean)} outliers") + +# Fancy indexing โ€” select specific rows/columns +X = np.random.randn(100, 10) +important_features = [0, 3, 7] # indices of best features +X_selected = X[:, important_features] # shape (100, 3) + +# np.where โ€” Conditional replacement +predictions = np.array([0.3, 0.7, 0.1, 0.9]) +labels = np.where(predictions > 0.5, 1, 0) # [0, 1, 0, 1] +
+ +

np.einsum โ€” One Function to Rule Them All

+
+A = np.random.randn(3, 4) +B = np.random.randn(4, 5) + +# Matrix multiply: C_ij = sum_k A_ik * B_kj +C = np.einsum('ik,kj->ij', A, B) # same as A @ B + +# Trace: sum of diagonal +trace = np.einsum('ii->', np.eye(4)) # 4.0 + +# Batch matrix multiply (common in deep learning) +batch_A = np.random.randn(32, 10, 20) # 32 matrices +batch_B = np.random.randn(32, 20, 5) +result = np.einsum('bij,bjk->bik', batch_A, batch_B) # (32,10,5) + +# Dot product of each row pair +X = np.random.randn(100, 768) # embeddings +similarity = np.einsum('ij,kj->ik', X, X) # cosine sim matrix +
+ +

Linear Regression โ€” The NumPy Way

+
+# Solve linear regression: y = Xฮฒ + ฮต +# Normal equation: ฮฒ = (X^T X)^{-1} X^T y +X = np.column_stack([np.ones(100), np.random.randn(100, 3)]) +y = np.random.randn(100) + +# Method 1: Direct (numerically unstable for large X) +beta = np.linalg.inv(X.T @ X) @ X.T @ y + +# Method 2: lstsq (stable, handles rank-deficient X) +beta, residuals, rank, sv = np.linalg.lstsq(X, y, rcond=None) + +# Method 3: SVD decomposition (most stable) +U, S, Vt = np.linalg.svd(X, full_matrices=False) +beta = Vt.T @ np.diag(1/S) @ U.T @ y +
+ +

Memory-Mapped Files for Huge Arrays

+
+# Process arrays larger than RAM +# Create memory-mapped file +big_array = np.memmap('huge_data.npy', dtype=np.float32, + mode='w+', shape=(1000000, 100)) + +# Write data in chunks +for i in range(0, 1000000, 10000): + big_array[i:i+10000] = np.random.randn(10000, 100) + +# Read slices without loading the entire file +subset = big_array[5000:6000] # Only reads 1000 rows from disk +print(subset.mean()) +
+
+ `, + interview: ` +
+

๐ŸŽฏ NumPy Interview Questions

+ +
+ Q1: Why is NumPy faster than Python lists for numerical operations? +

Answer: Three reasons: (1) Contiguous memory โ€” CPU cache-friendly, no pointer chasing. (2) Compiled C loops โ€” operations run in compiled C, not interpreted Python. (3) SIMD instructions โ€” modern CPUs process 4-8 floats simultaneously (AVX). Together: 50-100x speedup.

+
+ +
+ Q2: What's the difference between a view and a copy? Why does it matter? +

Answer: Views share data (slicing creates views). Copies duplicate data. arr[::2] is a view โ€” modifying it modifies the original. arr[[0,2,4]] (fancy indexing) is a copy. Views are fast and memory-efficient. Use np.shares_memory(a, b) to check. Always .copy() when you need independent data.

+
+ +
+ Q3: Explain broadcasting rules with an example. +

Answer: Compare shapes right-to-left. Dimensions must be equal or one must be 1. Example: (3,1) + (1,4) โ†’ (3,4). Each (3,1) row is "stretched" to match 4 columns. No memory is actually copied โ€” NumPy adjusts strides internally. Gotcha: (3,) + (3,4) fails โ€” need to reshape to (3,1) first.

+
+ +
+ Q4: What is axis=0 vs axis=1? +

Answer: axis=0 = operate down rows (column-wise). axis=1 = across columns (row-wise). Think: axis=0 collapses rows, axis=1 collapses columns. For (100,5) array: mean(axis=0) โ†’ shape (5,) โ€” one mean per feature. mean(axis=1) โ†’ shape (100,) โ€” one mean per sample.

+
+ +
+ Q5: How would you implement PCA using only NumPy? +

Answer: (1) Center data: X_c = X - X.mean(axis=0), (2) Covariance: cov = X_c.T @ X_c / (n-1), (3) Eigendecomposition: vals, vecs = np.linalg.eigh(cov), (4) Sort by eigenvalue descending, (5) Project: X_pca = X_c @ vecs[:, -k:]. Alternatively use SVD directly: U, S, Vt = np.linalg.svd(X_c).

+
+ +
+ Q6: What's the difference between np.dot, np.matmul (@), and np.einsum? +

Answer: np.dot: flattens for 1D, matrix multiply for 2D, but confusing for higher dims. @ (matmul): clean matrix multiply, broadcasts over batch dims. einsum: most flexible โ€” express any contraction. Use @ for readability, einsum for complex ops. Avoid np.dot for 3D+ arrays.

+
+ +
+ Q7: How do you handle NaN values in NumPy arrays? +

Answer: np.isnan(arr) detects NaNs. np.nanmean(arr), np.nanstd(arr) โ€” nan-safe aggregations. Replace: arr[np.isnan(arr)] = 0. Gotcha: np.nan == np.nan is False! NaN poisons comparisons. This is IEEE 754 standard.

+
+ +
+ Q8: What's structured arrays and when would you use them over Pandas? +

Answer: Structured arrays have named fields with mixed dtypes: np.dtype([('name', 'U10'), ('age', 'i4'), ('score', 'f8')]). Use when: (1) You need NumPy speed without Pandas overhead, (2) Interfacing with binary file formats (HDF5, FITS), (3) Processing millions of records where Pandas is too slow.

+
+ +
+ Q9: Explain the performance difference between C-order and Fortran-order. +

Answer: C-order stores rows contiguously; Fortran stores columns. Iterating along the last axis of C-order arrays is fastest because adjacent elements are in adjacent memory (cache-friendly). For column-heavy operations, Fortran order can be faster. NumPy defaults to C-order. np.asfortranarray() converts.

+
+ +
+ Q10: How would you vectorize a custom function that doesn't have a NumPy equivalent? +

Answer: Three options in order of speed: (1) np.vectorize(func) โ€” convenience wrapper, NOT actually vectorized (still Python loops), (2) Rewrite using broadcasting + boolean masks, (3) Use @numba.jit(nopython=True) for true compiled speed. Always prefer option 2 when possible.

+
+ +
+ Q11: What's np.random.seed() vs np.random.RandomState vs np.random.default_rng()? +

Answer: np.random.seed(42): global state, not thread-safe. RandomState(42): isolated state, legacy. default_rng(42): modern (NumPy 1.17+), uses PCG64, thread-safe, better statistical properties. Always use default_rng() in new code.

+
+ +
+ Q12: How do you compute pairwise distances between all points efficiently? +

Answer: Use the expansion: ||a-b||ยฒ = ||a||ยฒ + ||b||ยฒ - 2aยทb. Code: dists = np.sum(X**2, axis=1)[:,None] + np.sum(X**2, axis=1)[None,:] - 2 * X @ X.T. This avoids the O(nยฒร—d) explicit loop and leverages BLAS matrix multiply. scipy.spatial.distance.cdist wraps this.

+
+
+ ` + }, + + "pandas": { + concepts: ` +
+

Pandas Core Concepts

+ +

๐Ÿง  DataFrame Internals โ€” What Actually Happens Under the Hood

+
+
โšก BlockManager Architecture
+
+ A DataFrame is NOT a 2D array. Internally, Pandas uses a BlockManager โ€” columns of the same dtype are stored together in contiguous NumPy arrays (blocks). When you add a column of a different type, a new block is created. This is why column operations are fast (same block) but row iteration is slow (crosses blocks). +
+
+ +

DataFrame vs Series

+ + + + + + +
FeatureSeriesDataFrame
Dimensions1D labeled array2D labeled table
AnalogyA column in a spreadsheetThe entire spreadsheet
IndexSingle indexRow index + column index
Creationpd.Series([1,2,3])pd.DataFrame({'a': [1,2]})
+ +

The .loc vs .iloc Decision Tree

+
+
๐ŸŽฏ Golden Rule
+
+ .loc = Label-based. Use when you know column/row names. Inclusive on both ends.
+ .iloc = Integer-based. Use when you know positions. Exclusive on end (like Python slicing).
+ Gotcha: df.loc[0:5] includes row 5. df.iloc[0:5] excludes row 5. This trips up everyone. +
+
+ +

The SettingWithCopyWarning โ€” Finally Explained

+

When you chain indexing (df[df.x > 0]['y'] = 5), Pandas may create a temporary copy. Your assignment modifies the copy, not the original. Fix: Always use .loc: df.loc[df.x > 0, 'y'] = 5. In Pandas 2.0+, Copy-on-Write mode eliminates this issue entirely.

+ +

GroupBy Split-Apply-Combine

+

GroupBy is the most powerful Pandas operation. It follows three steps: (1) Split data into groups, (2) Apply a function to each group independently, (3) Combine results. The key insight: GroupBy is lazy โ€” no computation happens until you call an aggregation.

+ +

Method Chaining โ€” The Pandas Way

+

Fluent API style chains multiple operations. More readable, no intermediate variables, and enables .pipe() for custom functions. Use .assign() instead of df['col'] = ... for chainability.

+ +

Memory Optimization Strategies

+ + + + + + +
StrategySavingsWhen to Use
Category dtype90%+Columns with few unique strings (gender, country)
Downcast numerics50-75%int64 to int32/int16 when range allows
Sparse arrays80%+Columns that are mostly zeros/NaN
Read in chunksN/AFiles larger than RAM
+
+ `, + code: ` +
+

๐Ÿ’ป Pandas Code Examples

+ +

Method Chaining โ€” Production Pattern

+
+import pandas as pd +import numpy as np + +# Method chaining โ€” clean, readable data pipeline +result = ( + pd.read_csv('sales.csv') + .rename(columns=str.lower) + .assign( +date=lambda df: pd.to_datetime(df['date']), +revenue=lambda df: df['price'] * df['quantity'], +month=lambda df: df['date'].dt.month + ) + .query('revenue > 100') + .groupby('month') + .agg({'revenue': ['sum', 'mean', 'count']}) + .sort_values(('revenue', 'sum'), ascending=False) +) +
+ +

GroupBy โ€” Beyond Basic Aggregation

+
+# Multi-aggregation with named columns +summary = df.groupby('category').agg( + total_sales=('revenue', 'sum'), + avg_price=('price', 'mean'), + num_orders=('order_id', 'nunique'), + top_product=('product', lambda x: x.mode().iloc[0]) +) + +# Transform โ€” apply function, keep original shape +df['pct_of_group'] = df.groupby('category')['revenue'].transform( + lambda x: x / x.sum() * 100 +) + +# Filter โ€” keep only groups meeting criteria +big_groups = df.groupby('category').filter( + lambda g: len(g) >= 10 +) +
+ +

Merge Patterns โ€” SQL Joins in Pandas

+
+# LEFT JOIN with indicator to find unmatched +merged = pd.merge( + orders, customers, + on='customer_id', + how='left', + indicator=True # adds _merge column +) +orphan_orders = merged[merged['_merge'] == 'left_only'] + +# Merge on different column names +result = pd.merge( + df1, df2, + left_on='user_id', + right_on='id', + suffixes=('_orders', '_users') +) +
+ +

Time Series Operations

+
+# Resample โ€” change frequency +daily = df.set_index('date') +weekly = daily['revenue'].resample('W').sum() +monthly = daily['revenue'].resample('M').agg(['sum', 'mean', 'count']) + +# Rolling windows โ€” moving averages +df['ma_7'] = df['revenue'].rolling(7).mean() +df['ma_30'] = df['revenue'].rolling(30).mean() +df['expanding_mean'] = df['revenue'].expanding().mean() + +# Shift โ€” create lag features for ML +df['prev_day'] = df['revenue'].shift(1) +df['pct_change'] = df['revenue'].pct_change() +
+ +

Memory Optimization

+
+# Reduce DataFrame memory by 70%+ +def optimize_dtypes(df): + for col in df.select_dtypes(include=['int']).columns: +df[col] = pd.to_numeric(df[col], downcast='integer') + for col in df.select_dtypes(include=['float']).columns: +df[col] = pd.to_numeric(df[col], downcast='float') + for col in df.select_dtypes(include=['object']).columns: +if df[col].nunique() / len(df) < 0.5: + df[col] = df[col].astype('category') + return df + +# Before: 800 MB โ†’ After: 200 MB +df = optimize_dtypes(df) +print(df.memory_usage(deep=True).sum() / 1e6, "MB") +
+
+ `, + interview: ` +
+

๐ŸŽฏ Pandas Interview Questions

+ +
+ Q1: What causes SettingWithCopyWarning and how do you fix it? +

Answer: Chained indexing (df[mask]['col'] = val) may modify a copy, not the original. Fix: use df.loc[mask, 'col'] = val. In Pandas 2.0+, enable Copy-on-Write: pd.options.mode.copy_on_write = True. This makes all indexing return views until modification, then copies automatically.

+
+ +
+ Q2: How do you handle a 10GB CSV that doesn't fit in memory? +

Answer: 5 strategies: (1) pd.read_csv(chunksize=50000) โ€” process in batches, (2) usecols=['needed_cols'] โ€” load only what you need, (3) dtype={'col': 'int32'} โ€” use smaller types, (4) Dask โ€” lazy Pandas-like API, (5) DuckDB โ€” SQL on CSV files with zero memory overhead. Polars is also excellent for out-of-core processing.

+
+ +
+ Q3: Explain the difference between merge, join, and concat. +

Answer: merge(): SQL-style joins on columns (most flexible). join(): joins on index (convenience wrapper). concat(): stack DataFrames along axis (union/append). Use merge for column-based joins, concat for stacking rows/columns. join is just merge with index.

+
+ +
+ Q4: What's the difference between apply, map, and applymap? +

Answer: map(): Series only, element-wise. apply(): works on rows/columns of DataFrame or elements of Series. applymap(): element-wise on entire DataFrame (renamed to map() in Pandas 2.1). Performance tip: all three are slow โ€” prefer vectorized operations whenever possible.

+
+ +
+ Q5: How does GroupBy transform differ from agg? +

Answer: agg() reduces โ€” returns one value per group (changes shape). transform() broadcasts โ€” returns same shape as input. Example: df.groupby('dept')['salary'].transform('mean') fills every row with its department's average salary, while .agg('mean') returns one row per department.

+
+ +
+ Q6: What is MultiIndex and when would you use it? +

Answer: Hierarchical indexing โ€” multiple levels of row/column labels. Use for: pivot table results, panel data (entity + time), groupby with multiple keys. Access with .xs() or tuple slicing: df.loc[('A', 2023)]. Convert back with .reset_index().

+
+ +
+ Q7: How do you handle missing data in production? +

Answer: Strategy depends on context: (1) dropna(thresh=N) — keep rows with at least N non-null values, (2) df.ffill() — forward fill for time series (fillna(method='ffill') is deprecated since Pandas 2.1), (3) fillna(df.median()) — impute with median for ML, (4) interpolate(method='time') — time-weighted interpolation. Always check df.isna().sum() first.

+
+ +
+ Q8: What is the category dtype and when should you use it? +

Answer: Stores repeated strings as integer codes + lookup table. Use when a column has few unique values relative to total rows (e.g., 50 countries in 1M rows). Benefits: 90%+ memory savings, faster groupby. Gotcha: operations that create new values (like string concatenation) convert back to object dtype.

+
+ +
+ Q9: Pandas vs Polars vs DuckDB โ€” when to use each? +

Answer: Pandas: best ecosystem, most tutorials, sufficient for <1GB. Polars: 10-100x faster, lazy evaluation, multi-threaded, no GIL issues โ€” use for 1-100GB. DuckDB: SQL interface, out-of-core, great for analytical queries โ€” use when SQL is more natural or data exceeds RAM.

+
+ +
+ Q10: How do you create lag features and rolling statistics for time series ML? +

Answer: df['lag_1'] = df['value'].shift(1) for lag features. df['rolling_mean_7'] = df['value'].rolling(7).mean() for rolling stats. df['ewm_mean'] = df['value'].ewm(span=7).mean() for exponential weighted. Always sort by time first, use groupby().shift() for multi-entity data to avoid data leakage.

+
+
+ ` + }, + + "visualization": { + concepts: ` +
+

Data Visualization Principles

+ +

๐Ÿง  The Grammar of Graphics

+
+
โšก Every Chart = Data + Aesthetics + Geometry
+
+ Leland Wilkinson's framework: Data (what to plot), Aesthetics (x, y, color, size mappings), Geometry (bars, lines, points), Statistics (binning, smoothing), Coordinates (cartesian, polar), Facets (subplots). Seaborn and Plotly follow this pattern. Understanding it means you can build any chart. +
+
+ +

Choosing the Right Chart

+ + + + + + + + + +
QuestionChart TypeLibrary
Distribution of one variable?Histogram, KDE, Box plotSeaborn
Relationship between two variables?Scatter, Hexbin, RegressionSeaborn/Plotly
Comparison across categories?Bar, Grouped bar, ViolinSeaborn
Trend over time?Line chart, Area chartPlotly/Matplotlib
Correlation matrix?HeatmapSeaborn
Part of whole?Pie, Treemap, SunburstPlotly
Geographic data?Choropleth, Scatter mapboxPlotly/Folium
+ +

Matplotlib Architecture

+

Three layers: Backend (rendering engine), Artist (everything drawn), Scripting (pyplot). The Figure contains Axes (subplots). Each Axes has Axis objects. Always prefer the object-oriented API (fig, ax = plt.subplots()) over pyplot for production code.

+ +

Seaborn โ€” Statistical Visualization

+

Built on Matplotlib with statistical intelligence. Three API levels: Figure-level (relplot, catplot, displot โ€” create their own figure), Axes-level (scatterplot, boxplot โ€” plot on existing axes), Objects API (new in 0.12, more composable).

+ +

Plotly โ€” Interactive Dashboards

+

JavaScript-powered charts with hover, zoom, selection. plotly.express for quick plots, plotly.graph_objects for full control. Integrates with Dash for production dashboards. Supports 3D plots, maps, and animations.

+ +

Common Visualization Mistakes

+ +
+ `, + code: ` +
+

๐Ÿ’ป Visualization Code Examples

+ +

Matplotlib โ€” Publication-Quality Figures

+
+import matplotlib.pyplot as plt +import numpy as np + +# Professional figure setup +fig, axes = plt.subplots(1, 3, figsize=(15, 5)) + +# Subplot 1: Distribution +data = np.random.randn(1000) +axes[0].hist(data, bins=30, alpha=0.7, color='steelblue', edgecolor='white') +axes[0].set_title('Distribution', fontsize=14, fontweight='bold') +axes[0].axvline(data.mean(), color='red', linestyle='--', label='Mean') + +# Subplot 2: Scatter with colormap +x, y = np.random.randn(2, 100) +scatter = axes[1].scatter(x, y, c=y, cmap='viridis', alpha=0.7) +plt.colorbar(scatter, ax=axes[1]) + +# Subplot 3: Line with confidence interval +x = np.linspace(0, 10, 100) +y = np.sin(x) +axes[2].plot(x, y, 'b-', linewidth=2) +axes[2].fill_between(x, y-0.3, y+0.3, alpha=0.2) + +plt.tight_layout() +plt.savefig('figure.png', dpi=300, bbox_inches='tight') +
+ +

Seaborn โ€” Statistical Plots

+
+import seaborn as sns + +# Pair plot โ€” see all relationships at once +sns.pairplot(df, hue='target', diag_kind='kde', + plot_kws={'alpha': 0.6}) + +# Correlation heatmap with annotations +fig, ax = plt.subplots(figsize=(10, 8)) +mask = np.triu(np.ones_like(df.corr(), dtype=bool)) +sns.heatmap(df.corr(), mask=mask, annot=True, fmt='.2f', + cmap='RdBu_r', center=0, square=True) + +# Violin + strip plot โ€” distribution + individual points +fig, ax = plt.subplots(figsize=(10, 6)) +sns.violinplot(x='category', y='value', data=df, inner=None, alpha=0.3) +sns.stripplot(x='category', y='value', data=df, size=3, jitter=True) +
+ +

Plotly โ€” Interactive Visualizations

+
+import plotly.express as px +import plotly.graph_objects as go + +# Interactive scatter with hover info +fig = px.scatter(df, x='feature1', y='feature2', + color='target', size='importance', + hover_data=['name'], + title='Feature Analysis') + +# Animated chart โ€” data over time +fig = px.scatter(df, x='gdp', y='life_exp', + animation_frame='year', + size='population', color='continent', + hover_name='country', + size_max=60) +fig.show() +
+
+ `, + interview: ` +
+

๐ŸŽฏ Visualization Interview Questions

+ +
+ Q1: When would you use Matplotlib vs Seaborn vs Plotly? +

Answer: Matplotlib: full control, publication figures, custom layouts. Seaborn: statistical plots, quick EDA, beautiful defaults. Plotly: interactive dashboards, web apps, 3D/maps. Rule of thumb: Seaborn for EDA, Matplotlib for papers, Plotly for stakeholders.

+
+ +
+ Q2: How do you visualize high-dimensional data? +

Answer: (1) PCA/t-SNE/UMAP to 2D then scatter plot, (2) Pair plots for feature pairs, (3) Parallel coordinates, (4) Heatmap of correlation matrix, (5) SHAP summary plots for feature importance. For 100+ features, start with correlation heatmap to identify groups.

+
+ +
+ Q3: How do you handle overplotting in scatter plots? +

Answer: (1) Reduce alpha: alpha=0.1, (2) Hexbin plots: plt.hexbin(), (3) 2D KDE: sns.kdeplot(), (4) Random sampling for display, (5) Datashader for millions of points. The key is encoding density visually.

+
+ +
+ Q4: What makes a good visualization for non-technical stakeholders? +

Answer: (1) Clear title stating the conclusion, not the method, (2) Minimal chart junk โ€” remove gridlines, borders, legends when obvious, (3) Annotate key data points directly, (4) Use color consistently and meaningfully, (5) Tell a story โ€” what action should they take? Keep it to one insight per chart.

+
+ +
+ Q5: Explain the Figure and Axes API in Matplotlib. +

Answer: Figure is the entire window/canvas. Axes is a single plot area within the figure. fig, axes = plt.subplots(2,2) creates 4 plots. Always use the OO API for production โ€” ax.plot() not plt.plot(). This gives you explicit control over which subplot you're modifying.

+
+ +
+ Q6: How do you make accessible visualizations? +

Answer: (1) Use colorblind-safe palettes (viridis, cividis), (2) Don't rely on color alone โ€” add shapes/patterns, (3) Sufficient contrast ratios, (4) Alt text for web charts, (5) Large enough font sizes (12pt minimum). Test with colorblindness simulators.

+
+
+ ` + }, + + "advanced-python": { + concepts: ` +
+

Advanced Python Engineering

+ +

๐Ÿง  Professional Decorators โ€” Beyond "Hello World"

+
+
โšก Closures & Wrappers
+
+ Decorators are higher-order functions that modify behavior without changing code. Professional implementation tools: Use functools.wraps to preserve metadata (name, docstring), handle both positional and keyword arguments, and support decorators with parameters (factories). +
+
+ +

Context Managers (The Pythonic Way)

+

Managing resources (files, locks, DB connections) reliably. with blocks guarantee cleanup even on errors. Implementation options: (1) Class-based with __enter__ and __exit__, (2) Function-based with @contextlib.contextmanager and yield.

+ +

Iterators & Generators โ€” Memory Efficiency

+
+
๐Ÿ’ก Why Generators?
+ Generators use lazy evaluation. They produce values one at a time using yield, using constant memory O(1) regardless of dataset size. Ideal for processing huge datasets or infinite streams. +
+ +

Object-Oriented Design for Data Science

+ + + + + + +
ConceptData Science Use Case
InheritanceBaseModel โ†’ LinearModel โ†’ LogisticRegression
Abstract Base ClassesDefining mandatory methods like fit()/predict()
PropertiesValidating input parameters (e.g., learning rate > 0)
Dunder Methods__call__ for making models callable, __getitem__ for datasets
+ +

Metaclasses & Dynamic Programming

+

Classes are objects too! Classes define how instances behave; Metaclasses define how classes behave. Useful for registry patterns (auto-registering models) or enforcement of interface standards across a codebase. type is the default metaclass.

+
+ `, + code: ` +
+

๐Ÿ’ป Advanced Python Code Examples

+ +

The Production-Grade Decorator

+
+from functools import wraps +import time +import logging + +def timer_with_logging(logger): + def decorator(func): +@wraps(func) +def wrapper(*args, **kwargs): + start = time.perf_counter() + try: + result = func(*args, **kwargs) + return result + finally: + duration = time.perf_counter() - start + logger.info(f"Executed {func.__name__} in {duration:.4f}s") +return wrapper + return decorator + +@timer_with_logging(logging.getLogger(__name__)) +def train_model(X, y): + # Simulate training + time.sleep(1.5) +
+ +

Custom Context Manager for GPU Lock

+
+from contextlib import contextmanager + +@contextmanager +def gpu_lock(device_id): + print(f"Acquiring lock for GPU {device_id}") + try: +yield f"GPU_{device_id}_CONTEXT" + finally: +print(f"Releasing GPU {device_id}") + +with gpu_lock(0) as ctx: + print(f"Training with {ctx}") +
+ +

ABC & Protocol โ€” Enforcing Interfaces

+
+from abc import ABC, abstractmethod +from typing import Protocol + +class Predictor(Protocol): + def predict(self, X: np.ndarray) -> np.ndarray: ... + +class BaseModel(ABC): + @abstractmethod + def fit(self, X, y): +pass + +class MyModel(BaseModel): + def fit(self, X, y): +print("Fitting...") + + def predict(self, X): +return X @ self.weights +
+ +

Functional Pipelines with itertools

+
+import itertools + +# Process infinite stream in batches +def get_batches(stream, size): + it = iter(stream) + while True: +batch = list(itertools.islice(it, size)) +if not batch: break +yield batch + +# Data pipeline: chain -> filter -> map -> batch +processed = get_batches( + map(str.upper, filter(lambda x: len(x) > 5, stream)), + size=64 +)
+
+ `, + interview: ` +
+

๐ŸŽฏ Advanced Python Interview Questions

+ +
+ Q1: What's the difference between __str__ and __repr__? +

Answer: __str__ is for end-users (informal, readable). __repr__ is for developers (detailed, unambiguous, "eval-able"). For data science, always implement __repr__ for models to show hyperparameters when printed.

+
+ +
+ Q2: Explain Python's MRO (Method Resolution Order). +

Answer: C3 Linearization algorithm. It determines the search order for methods in multiple inheritance. Access it via ClassName.mro(). Python ensures that bases are searched after their subclasses and the order of bases in the class definition is preserved.

+
+ +
+ Q3: How do you implement a Singleton pattern in Python? +

Answer: Several ways: (1) Overriding __new__, (2) Using a Metaclass (cleanest), (3) Module-level variables (simplest). Example with Metaclass: class Singleton(type): ... then class Database(metaclass=Singleton): ....

+
+ +
+ Q4: Decorators: How to handle @timer(unit='ms')? +

Answer: This is a decorator factory. You need three levels of functions: (1) Factory takes parameters and returns a decorator, (2) Decorator takes the function and returns a wrapper, (3) Wrapper takes args/kwargs and executes the logic.

+
+ +
+ Q5: What are *args and **kwargs and when to use them? +

Answer: *args collects positional arguments into a tuple. **kwargs collects keyword arguments into a dictionary. Crucial for wrapping functions, implementing decorators, or creating flexible API interfaces like Scikit-learn's __init__(**params).

+
+ +
+ Q6: Explain the difference between is and ==. +

Answer: == checks for equality (values are the same). is checks for identity (objects occupy the same memory address). Use is for Singletons like None or bool. Example: a = [1]; b = [1]; a == b is True, a is b is False.

+
+
+ ` + }, + "sklearn": { + concepts: ` +
+

Scikit-learn & ML Engineering

+ +

๐Ÿง  The Estimator API โ€” Unified Interface

+
+
โšก Consistency is King
+
+ Scikit-learn's brilliance lies in its interface consistency. Estimators have fit(X, y), Transformers have transform(X), and Predictors have predict(X). This design allows for seamless swapping of models and preprocessing steps. +
+
+ +

Production Pipelines โ€” Avoiding Data Leakage

+

A Pipeline bundles preprocessing and modeling into a single object. Crucial Benefit: It ensures that transformers are fit only on the training fold during cross-validation, preventing information from the validation set (like mean/std) from "leaking" into training. Always use pipelines in production.

+ +

ColumnTransformer โ€” Heterogeneous Data

+

Most real-world data is a mix of types. ColumnTransformer allows you to apply different preprocessing pipelines to different columns (e.g., OneHotEncode categories, Scale numerics) and then concatenate them for the model.

+ +

Model Evaluation Beyond Accuracy

+ + + + + + + +
MetricUse CaseScikit-learn Name
F1-ScoreImbalanced classification (Precision-Recall balance)f1_score
ROC-AUCProbability ranking / classifier qualityroc_auc_score
MSE / MAERegression error magnitudemean_squared_error
R2 ScoreVariance explained by modelr2_score
Log LossProbabilistic predictions confidencelog_loss
+ +

Cross-Validation Strategies

+

(1) K-Fold: standard, (2) Stratified K-Fold: for imbalanced data, (3) TimeSeriesSplit: for temporal data (preventing looking into the future), (4) GroupKFold: to ensure samples from the same group aren't split across train/test.

+
+ `, + code: ` +
+

๐Ÿ’ป Scikit-learn Code Examples

+ +

The Modular Pipeline Pattern

+
+from sklearn.pipeline import Pipeline +from sklearn.compose import ColumnTransformer +from sklearn.preprocessing import StandardScaler, OneHotEncoder +from sklearn.ensemble import RandomForestClassifier + +# Define preprocessing for different feature types +numeric_transformer = Pipeline(steps=[ + ('scaler', StandardScaler()) +]) + +categorical_transformer = Pipeline(steps=[ + ('onehot', OneHotEncoder(handle_unknown='ignore')) +]) + +preprocessor = ColumnTransformer(transformers=[ + ('num', numeric_transformer, numeric_features), + ('cat', categorical_transformer, categorical_features) +]) + +# Create full pipeline +clf = Pipeline(steps=[ + ('preprocessor', preprocessor), + ('classifier', RandomForestClassifier(n_estimators=100)) +]) + +# Entire workflow in one object +clf.fit(X_train, y_train) +preds = clf.predict(X_test) +
+ +

Custom Transformers โ€” Industry Standard

+
+from sklearn.base import BaseEstimator, TransformerMixin + +class LogTransformer(BaseEstimator, TransformerMixin): + def __init__(self, columns=None): +self.columns = columns + + def fit(self, X, y=None): +return self + + def transform(self, X): +X_out = X.copy() +for col in self.columns: + X_out[col] = np.log1p(X_out[col]) +return X_out + +# Now usable in any Pipeline +pipeline = Pipeline([ + ('log', LogTransformer(columns=['revenue'])), + ('model', LinearRegression()) +]) +
+ +

Hyperparameter Optimization (Advanced)

+
+from sklearn.model_selection import RandomizedSearchCV +from scipy.stats import randint + +param_dist = { + 'classifier__n_estimators': randint(50, 500), + 'classifier__max_depth': [5, 10, 20, None], + 'preprocessor__num__scaler': [StandardScaler(), RobustScaler()] +} + +search = RandomizedSearchCV(clf, param_dist, n_iter=50, cv=3) +search.fit(X_train, y_train) + +print(f"Best Score: {search.best_score_}") +print(f"Best Params: {search.best_params_}") +
+
+ `, + interview: ` +
+

๐ŸŽฏ Scikit-learn Interview Questions

+ +
+ Q1: Why use fit_transform on train but only transform on test? +

Answer: To prevent Data Leakage. Mean/variance for scaling must be learned ONLY from training data. Applying fit to test data uses future information about the test distribution, leading to overly optimistic results.

+
+ +
+ Q2: When would you use predict_proba instead of predict? +

Answer: When you need the uncertainty of the model or need to adjust the decision threshold. For cost-sensitive problems (e.g., fraud), you might flag anything with >10% probability, rather than the default 50%.

+
+ +
+ Q3: Explain the bias-variance tradeoff in terms of Complexity. +

Answer: Underfitting (High Bias) happens when the model is too simple (e.g., linear on non-linear data). Overfitting (High Variance) happens when the model is too complex and captures noise. Regularization (Alpha/C parameters) is used to find the "sweet spot".

+
+ +
+ Q4: How do you handle imbalanced datasets in Sklearn? +

Answer: (1) class_weight='balanced' inside estimators, (2) Stratified cross-validation, (3) Focus on Precision-Recall curves/AUC instead of Accuracy, (4) Resampling (using imblearn library which is Sklearn-compatible).

+
+ +
+ Q5: What's the difference between L1 (Lasso) and L2 (Ridge) regularization? +

Answer: L1 adds absolute value penalty; it results in sparse models (coefficients become exactly zero), effectively performing feature selection. L2 adds squared penalty; it shrinks coefficients towards zero but rarely to zero, good for handling multicollinearity.

+
+
+ ` + }, + "pytorch": { + concepts: ` +
+

PyTorch & Deep Learning Primitives

+ +

๐Ÿง  Computational Graphs & Autograd

+
+
โšก Dynamic vs Static
+
+ PyTorch uses Dynamic Computational Graphs (Define-by-Run). The graph is built on-the-fly as operations are performed. Autograd tracks every operation on tensors with requires_grad=True and automatically computes gradients using the chain rule during .backward(). +
+
+ +

Tensors โ€” The Heart of PyTorch

+

Tensors are multi-dimensional arrays (like NumPy) but with two superpowers: (1) GPU Acceleration (move to 'cuda' or 'mps'), (2) Automatic Differentiation. Bridging to NumPy is zero-copy for CPU tensors.

+ +

Modular Architecture (nn.Module)

+

Every model in PyTorch inherits from nn.Module. You define parameters/layers in __init__ and the forward pass logic in forward(). This design promotes recursive composition โ€” models can contain other modules.

+ +

Data Engineering: Dataset & DataLoader

+ + + + + +
ComponentResponsibility
DatasetDefines HOW to load a single sample (__getitem__) and total count (__len__)
DataLoaderHandles batching, shuffling, multi-process loading, and memory pinning
TransformsOn-the-fly augmentation (cropping, flipping, normalizing)
+ +

The Optimization Loop Essentials

+

Standard pattern: (1) Zero gradients, (2) Forward pass, (3) Compute Loss, (4) Backward pass (backprop), (5) Optimizer step. Don't forget model.train() and model.eval() to toggle dropout and batch norm behavior.

+
+ `, + code: ` +
+

๐Ÿ’ป PyTorch Code Examples

+ +

The Ultimate Training Boilerplate

+
+import torch +import torch.nn as nn +import torch.optim as optim + +# Device agnostic code +device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + +# 1. Define Architecture +class SimpleNet(nn.Module): + def __init__(self): +super().__init__() +self.flatten = nn.Flatten() +self.fc = nn.Sequential( + nn.Linear(28*28, 512), + nn.ReLU(), + nn.Dropout(0.2), + nn.Linear(512, 10) +) + + def forward(self, x): +x = self.flatten(x) +return self.fc(x) + +model = SimpleNet().to(device) +optimizer = optim.Adam(model.parameters(), lr=1e-3) +criterion = nn.CrossEntropyLoss() + +# 2. Training Loop +model.train() +for batch, (X, y) in enumerate(dataloader): + X, y = X.to(device), y.to(device) + + # Zero -> Forward -> Backward -> Step + optimizer.zero_grad() + pred = model(X) + loss = criterion(pred, y) + loss.backward() + optimizer.step() +
+ +

Custom Dataset Implementation

+
+from torch.utils.data import Dataset + +class ImageDataset(Dataset): + def __init__(self, annotations_file, img_dir, transform=None): +self.img_labels = pd.read_csv(annotations_file) +self.img_dir = img_dir +self.transform = transform + + def __len__(self): +return len(self.img_labels) + + def __getitem__(self, idx): +img_path = os.path.join(self.img_dir, self.img_labels.iloc[idx, 0]) +image = read_image(img_path) +label = self.img_labels.iloc[idx, 1] +if self.transform: + image = self.transform(image) +return image, label +
+ +

Transfer Learning โ€” Freezing Layers

+
+from torchvision import models + +# weights= replaces the deprecated pretrained=True API +model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1) + +# Freeze all weights +for param in model.parameters(): + param.requires_grad = False + +# Replace final head (newly initialized, so requires_grad=True) +num_ftrs = model.fc.in_features +model.fc = nn.Linear(num_ftrs, 2) + +model = model.to(device) +# Only model.fc.parameters() will be updated +optimizer = optim.SGD(model.fc.parameters(), lr=0.001)
+
+ `, + interview: ` +
+

๐ŸŽฏ PyTorch Interview Questions

+ +
+ Q1: Why is optimizer.zero_grad() necessary? +

Answer: By default, PyTorch accumulates gradients on every .backward() call. This is useful for RNNs or training with effectively larger batch sizes than memory allows. If you don't zero them out, gradients from previous batches will influence the current update, leading to incorrect training.

+
+ +
+ Q2: What is the difference between model.train() and model.eval()? +

Answer: They set the mode for specific layers. .train() enables Dropout and Batch Normalization (calculates stats for current batch). .eval() disables dropout and uses running averages for Batch Norm. Forgetting .eval() during testing will lead to inconsistent/bad predictions.

+
+ +
+ Q3: Explain the role of torch.no_grad(). +

Answer: It's a context manager that disables gradient calculation. Use it during inference or validation to save memory and compute resources. It prevents the creation of the computational graph for those operations.

+
+ +
+ Q4: PyTorch vs TensorFlow โ€” technical tradeoffs? +

Answer: PyTorch (Dynamic graph) is more Pythonic, easier to debug with standard tools, and highly favored in research. TensorFlow (Static graph/Keras) historically had better deployment tools (TFLite, TFServing) and massive industry scale, though the gap has significantly narrowed with PyTorch 2.0 and TorchServe.

+
+ +
+ Q5: What is "Tensor Broadcasting" in PyTorch? +

Answer: Same as NumPy. If dimensions don't match, PyTorch automatically expands the smaller tensor (by repeating values) to match the larger one, provided they are compatible (trailing dimensions match or are 1). This happens without actual memory copying.

+
+
+ ` + }, + "tensorflow": { + concepts: ` +
+

TensorFlow & Production DL

+ +

๐Ÿง  The Keras Ecosystem

+
+
โšก User-First API
+
+ Keras is TensorFlow's high-level API. It focuses on Developer Experience (DX) โ€” minimizing the number of user actions for common use cases. tf.keras supports three ways to build models: (1) Sequential (simple stacks), (2) Functional (DAGs, multi-input/output), (3) Subclassing (full control). +
+
+ +

tf.data โ€” Performance Pipelines

+

Loading data is often the bottleneck. tf.data.Dataset enables "ETL" pipelines: Extract (from disk/cloud), Transform (shuffle, batch, repeat), Load (map to GPU). Concepts like prefetch and interleave ensure the GPU is never waiting for the CPU.

+ +

Static Graphs & tf.function

+

TensorFlow can convert Python code into a Static Computational Graph using @tf.function. This enables significant optimizations like constant folding and makes models exportable to environments without Python (C++, Java, JS).

+ +

Monitoring with TensorBoard

+ + + + + + +
ComponentVisualized metric
ScalarsLoss/Accuracy curves in real-time
HistogramsWeights/Gradients distribution (checking for vanishing/exploding)
GraphsThe internal model architecture
ProjectorHigh-dimensional embeddings (t-SNE/PCA)
+ +

Deployment Architecture (TFX)

+

TensorFlow Extended (TFX) is for end-to-end ML. Key components: TF Serving (for APIs), TF Lite (for mobile/edge), TFJS (for web browsers). TF Serving supports model versioning and A/B testing out of the box.

+
+ `, + code: ` +
+

๐Ÿ’ป TensorFlow Code Examples

+ +

The Functional API Pattern

+
+import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + +# Functional API โ€” best for production model logic +inputs = keras.Input(shape=(784,)) +x = layers.Dense(64, activation="relu")(inputs) +x = layers.Dense(64, activation="relu")(x) +outputs = layers.Dense(10, activation="softmax")(x) + +model = keras.Model(inputs=inputs, outputs=outputs, name="mnist_model") + +model.compile( + loss=keras.losses.SparseCategoricalCrossentropy(), + optimizer=keras.optimizers.RMSprop(), + metrics=["accuracy"], +) + +history = model.fit(x_train, y_train, batch_size=64, epochs=2, validation_split=0.2) +
+ +

High-Performance Data Pipeline

+
+def load_and_preprocess(path, label): + image = tf.io.read_file(path) + image = tf.image.decode_jpeg(image, channels=3) + image = tf.image.resize(image, [224, 224]) + return image / 255.0, label + +dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)) +dataset = (dataset + .shuffle(1000) + .map(load_and_preprocess, num_parallel_calls=tf.data.AUTOTUNE) + .batch(32) + .prefetch(tf.data.AUTOTUNE) # Overlap training and preprocessing +) +
+ +

Custom Layers & Training Loops

+
+# Custom Training Loop (GradientTape) +optimizer = tf.keras.optimizers.Adam() +loss_fn = tf.keras.losses.BinaryCrossentropy() + +@tf.function # Compiles to static graph for speed +def train_step(x, y): + with tf.GradientTape() as tape: +logits = model(x, training=True) +loss_value = loss_fn(y, logits) + + grads = tape.gradient(loss_value, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + return loss_value +
+
+ `, + interview: ` +
+

๐ŸŽฏ TensorFlow Interview Questions

+ +
+ Q1: What is tf.function and AutoGraph? +

Answer: tf.function is a decorator that converts a regular Python function into a TensorFlow static graph. AutoGraph is the internal tool that translates Python control flow (if, while) into TF graph ops. This allows for compiler-level optimizations and easy deployment without a Python environment.

+
+ +
+ Q2: Why use tf.data.AUTOTUNE? +

Answer: It allows TensorFlow to dynamically adjust the level of parallelism and buffer sizes based on your CPU/disk hardware. It ensures that data preprocessing (CPU) is always one step ahead of model training (GPU), preventing hardware starvation.

+
+ +
+ Q3: Functional API vs Sequential vs Subclassing? +

Answer: Sequential: purely linear stacks. Functional: most common for production, supports non-linear topology (shared layers, multiple inputs/outputs). Subclassing: full control over the forward pass, best for complex research/custom logic. Functional is generally preferred for its balance of power and debugging ease.

+
+ +
+ Q4: How do you prevent overfitting in TensorFlow? +

Answer: (1) EarlyStopping callback, (2) Dropout layers, (3) L1/L2 kernel regularizers, (4) Data augmentation (via tf.image or keras.layers), (5) Learning rate schedules via callbacks.ReduceLROnPlateau.

+
+ +
+ Q5: What is SavedModel format? +

Answer: The language-neutral, hermetic serialization format for TF models. It includes the model architecture, weights, and the computational graph (signatures). It is the standard format for TF Serving and TFLite conversion.

+
+
+ ` + }, + "production": { + concepts: ` +
+

Production Python & MLOps

+ +

๐Ÿง  FastAPI โ€” The Modern Standard

+
+
โšก High Performance APIs
+
+ FastAPI is built on Starlette and Pydantic. It supports async/await for handling concurrent requests without blocking, uses type hints for automatic validation, and generates interactive OpenAPI (Swagger) documentation. It is the gold standard for serving ML models today. +
+
+ +

Pydantic & Data Validation

+

In production, you cannot trust input data. Pydantic enforces strict type checking and validation at runtime. If a JSON request arrives with a string instead of a float for a model feature, Pydantic catches it immediately and returns a clear error before the model even sees it.

+ +

The ML Model Serving Lifecycle

+ + + + + + +
StageResponsibilityTools
InitializationLoading model weights into memory (once)FastAPI Lifespan
InferencePreprocessing input and getting predictionNumPy/Pydantic
Post-processingFormatting prediction for the clientJSON/Protobuf
ObservabilityLogging latency, inputs, and driftPrometheus/ELK
+ +

Dependency Management & Docker

+

Conda vs Pip: Pip is standard for Python; Conda is better for C-extensions/CUDA. Docker: Containerizing the environment ensures it "works on my machine" translates to "works in the cloud". Use lightweight base images (python:3.10-slim) to minimize security risks and build times.

+ +

Testing ML Applications

+

(1) Unit tests: for preprocessing logic, (2) Integration tests: for the API endpoints, (3) Model Quality tests: ensuring the model meets a minimum accuracy threshold on a benchmark dataset before deployment.

+
+ `, + code: ` +
+

๐Ÿ’ป Production Python Code Examples

+ +

The FastAPI Model Server Pattern

+
+from fastapi import FastAPI, HTTPException +from pydantic import BaseModel +import joblib + +app = FastAPI(title="ML Model API") + +# 1. Prediction Schema +class PredictionInput(BaseModel): + feature_1: float + feature_2: float + category: str + +# 2. Global Predictor Registry +model = None + +@app.on_event("startup") +def load_model(): + global model + model = joblib.load('model.joblib') + +@app.post("/predict") +async def predict(data: PredictionInput): + try: +features = [[data.feature_1, data.feature_2]] +prediction = model.predict(features) +return {"prediction": float(prediction[0])} + except Exception as e: +raise HTTPException(status_code=500, detail=str(e)) +
+ +

Robust Logging Strategy

+
+import logging +import sys + +def get_logger(name): + logger = logging.getLogger(name) + logger.setLevel(logging.INFO) + + # JSON formatter for easier ELK/Splunk ingestion + handler = logging.StreamHandler(sys.stdout) + formatter = logging.Formatter( +'{"time":"%(asctime)s", "name":"%(name)s", "level":"%(levelname)s", "msg":"%(message)s"}' + ) + handler.setFormatter(formatter) + logger.addHandler(handler) + return logger +
+ +

Docker Configuration (ML Specific)

+
+# Dockerfile for ML Service +FROM python:3.10-slim + +WORKDIR /app +COPY requirements.txt . + +# Install dependencies without cache +RUN pip install --no-cache-dir -r requirements.txt + +COPY . . + +# Expose port and run server +EXPOSE 8000 +CMD ["uvicorn", "main:app", "--host", "0.0.0.0", "--port", "8000"] +
+
+ `, + interview: ` +
+

๐ŸŽฏ Production Interview Questions

+ +
+ Q1: Why use FastAPI over Flask for ML models? +

Answer: (1) Native async support (handles concurrent requests better), (2) Automatically generates Swagger UI for testing, (3) Pydantic integration for data validation, (4) Significantly higher throughput (close to Go/Node.js levels), (5) Built-in support for WebSockets and background tasks.

+
+ +
+ Q2: How do you handle model versioning in production? +

Answer: (1) URL versioning (/v1/predict), (2) Model registry (MLflow/SageMaker) with aliases like production or staging, (3) Blue-green deployment โ€” route traffic to the new version only after validation, (4) Embed the model version in the API response metadata for debugging.

+
+ +
+ Q3: What is "Dependency Hell" and how do you solve it? +

Answer: It occurs when multiple libraries require conflicting versions of the same dependency. Solved by: (1) Using virtual environments (venv/conda), (2) pinning exact versions in requirements.txt or poetry.lock, (3) Docker to isolate the entire OS environment.

+
+ +
+ Q4: How do you log inputs/outputs without violating privacy? +

Answer: (1) PII Masking: remove names/emails/IDs before logging, (2) Hash sensitive fields if they are needed for troubleshooting, (3) Separate logging of model metadata from raw data, (4) Use specialized monitoring tools like Arize or Whylogs for drift detection without full data capture.

+
+ +
+ Q5: What's the role of CI/CD in Machine Learning? +

Answer: Beyond standard code tests, ML CI/CD (MLOps) includes Data Validation (is the incoming data schema correct?), Model Validation (is accuracy >= 90%?), and automated deployment to staging for human-in-the-loop review.

+
+
+ ` + }, + "optimization": { + concepts: ` +
+

Python High Performance & Optimization

+ +

๐Ÿง  The GIL (Global Interpreter Lock) โ€” Deep Dive

+
+
โšก The Bottleneck of Python
+
+ The GIL is a mutex that protects access to Python objects, preventing multiple native threads from executing Python bytecode at once. Critical for DS: NumPy and Pandas release the GIL while running their C-level computations, so multithreaded code that spends most of its time in those routines can occupy multiple CPU cores despite the GIL — and single-operation multi-core speedups additionally come from a threaded BLAS backend (MKL/OpenBLAS), not from the GIL being released. +
+
+ +

Profiling: Finding the Real Bottleneck

+

Never optimize without measuring. (1) cProfile: for function-level timing, (2) line_profiler: for line-by-line analysis in "hot" functions, (3) memory_profiler: to detect memory leaks and peak usage, (4) Py-Spy: a sampling profiler for zero-instrumentation production profiling.

+ +

Numba โ€” JIT Compilation for NumPy

+

Numba translates a subset of Python and NumPy code into fast machine code using LLVM. By simply adding @njit, you can achieve C/Fortran-like speeds for math-heavy loops that cannot be vectorized with pure NumPy.

+ +

Concurrency Models in Python

+ + + + + +
ModelBest for...Mechanism
ThreadingI/O-bound (APIs, DBs)Concurrent but not parallel (GIL)
MultiprocessingCPU-bound (Training, Math)True parallelism (separate OS processes)
asyncioHigh-concurrency I/OSingle-threaded cooperative multitasking
+ +

Vectorization & SIMD

+

Single Instruction, Multiple Data (SIMD) allows a CPU to perform the same operation on multiple data points in one clock cycle. Modern NumPy leverages AVX-512 and MKL/OpenBLAS to ensure your a + b is as fast as the hardware allows.

+ +

Cython โ€” When All Else Fails

+

Cython is a superset of Python that compiles to C. It allows you to call C functions directly and use static typing. Use it for complex algorithms that require low-level memory control (e.g., custom tree models or graph algorithms).

+
+ `, + code: ` +
+

๐Ÿ’ป Performance & Optimization Code Examples

+ +

Numba โ€” JIT Speedups

+
+from numba import njit, prange +import numpy as np + +# This loop is ~100x slower in pure Python. +# NOTE: prange (not range) is required for parallel=True to actually +# parallelize the loop — with plain range the loop stays serial. +@njit(parallel=True) +def monte_carlo_pi(nsamples): +    acc = 0 +    for i in prange(nsamples): +        x = np.random.random() +        y = np.random.random() +        if x**2 + y**2 < 1.0: +            acc += 1 +    return 4.0 * acc / nsamples +
+ +

Multiprocessing for Data Prep

+
+from multiprocessing import Pool + +def heavy_image_prep(file_path): + # Complex transform logic here + return processed_img + +# Use all available cores +if __name__ == '__main__': + with Pool() as p: +results = p.map(heavy_image_prep, all_files) +
+ +

Memory Optimization with __slots__

+
+class Observation: + # Prevents creation of __dict__, saving significant RAM + __slots__ = ('timestamp', 'value', 'sensor_id') + + def __init__(self, ts, val, sid): +self.timestamp = ts +self.value = val +self.sensor_id = sid + +# 1 million instances: ~60MB vs ~160MB without __slots__ +data = [Observation(i, i*1.1, 'S1') for i in range(1000000)] +
+
+ `, + interview: ` +
+

๐ŸŽฏ Performance Interview Questions

+ +
+ Q1: Why does Python have a GIL? +

Answer: It simplifies implementation by making the memory management (reference counting) thread-safe without needing granular locks. It also makes single-threaded code faster and C-extension integration easier. Removing it is difficult because it effectively requires a rewrite of the interpreter (see: "no-gil" Python 3.13 proposal).

+
+ +
+ Q2: How do you optimize a function with a nested loop? +

Answer: (1) Vectorize with NumPy (broadcast), (2) If logic is too complex for NumPy, use Numba JIT, (3) Use Cython if you need C-level types, (4) Use multiprocessing if the iterations are independent and CPU-bound.

+
+ +
+ Q3: Explain the "cProfile" overhead. +

Answer: cProfile is a deterministic profiler; it hooks into every function call. While very accurate, it adds significant overhead (sometimes 2x slowdown). For production systems, "Sampling Profilers" (like Py-Spy) are better as they only inspect the stack every few milliseconds, adding negligible overhead.

+
+ +
+ Q4: When is Threading faster than Multiprocessing? +

Answer: For I/O-bound tasks (Network/Disk). Threading has much lower overhead (shared memory) compared to Multiprocessing (separate memory spaces, requires serialization/pickling of data between processes). For downloading 1000 images, threads are superior.

+
+ +
+ Q5: What is "Cache Locality" and how does NumPy help? +

Answer: CPUs are fastest when accessing contiguous memory (Spatial Locality). NumPy's C-contiguous arrays ensure that when one value is loaded into the CPU cache, the next values are also loaded, minimizing "Cache Misses" compared to Python lists of scattered objects.

+
+
+ ` + } +}; + +// Render dashboard cards +function renderDashboard() { + const grid = document.getElementById('modulesGrid'); + grid.innerHTML = modules.map(module => ` +
+
${module.icon}
+

${module.title}

+

${module.description}

+ ${module.category} +
+ `).join(''); +} + +// Show specific module +function showModule(moduleId) { + const module = modules.find(m => m.id === moduleId); + const content = MODULE_CONTENT[moduleId]; + + document.getElementById('dashboard').classList.remove('active'); + + const moduleHTML = ` +
+ +
+

${module.icon} ${module.title}

+

${module.description}

+
+ +
+ + + +
+ +
${content.concepts}
+
${content.code}
+
${content.interview}
+
+ `; + + document.getElementById('modulesContainer').innerHTML = moduleHTML; +} + +// Switch tabs +function switchTab(moduleId, tabName, e) { + const moduleEl = document.getElementById(`module-${moduleId}`); + + // Update tab buttons + moduleEl.querySelectorAll('.tab-btn').forEach(btn => btn.classList.remove('active')); + if (e && e.target) { + e.target.classList.add('active'); + } else { + // Fallback: find the button by tab name + const tabNames = ['concepts', 'code', 'interview']; + const idx = tabNames.indexOf(tabName); + if (idx !== -1) moduleEl.querySelectorAll('.tab-btn')[idx]?.classList.add('active'); + } + + // Update tab content + moduleEl.querySelectorAll('.tab').forEach(tab => tab.classList.remove('active')); + document.getElementById(`${moduleId}-${tabName}`).classList.add('active'); +} + +// Back to dashboard +function backToDashboard() { + document.querySelectorAll('.module').forEach(m => m.remove()); + document.getElementById('dashboard').classList.add('active'); +} + +// Initialize + +document.addEventListener('DOMContentLoaded', renderDashboard);