Deep Learning — Neural Networks at Scale
Deep learning is a branch of machine learning that uses neural networks with many layers to learn hierarchical representations from raw data.
Key Architectures
- Convolutional networks (CNNs) — images and vision.
- Recurrent networks (RNNs, LSTMs) — sequences.
- Transformers — language models, vision transformers, multimodal AI.
- Generative models — GANs, VAEs, diffusion models.
Training Essentials
Training relies on stochastic gradient descent with backpropagation, combined with regularisation, learning-rate schedules, batch normalisation, and modern optimisers such as AdamW.
Code Examples: Deep Learning (5 runnable snippets)
Copy any block into a file or notebook and run it end-to-end — each example stands alone.
Example 1: Fine-tune a classifier head on frozen embeddings
# Example 1: Fine-tune a classifier head on frozen embeddings -- Deep Learning
import torch
from torch import nn

torch.manual_seed(0)

# Stand-in for features from a frozen backbone: 800 fixed 384-d embeddings
# with one of 4 class labels each.
emb_dim = 384
train_emb = torch.randn(800, emb_dim)
train_y = torch.randint(0, 4, (800,))

# Only this small head is optimised; the embeddings themselves never change.
head = nn.Sequential(nn.Dropout(0.1), nn.Linear(emb_dim, 4))
opt = torch.optim.AdamW(head.parameters(), lr=3e-4, weight_decay=1e-2)
loss_fn = nn.CrossEntropyLoss()

for step in range(200):
    # Sample a random mini-batch of 64 embeddings (with replacement).
    idx = torch.randint(0, len(train_emb), (64,))
    logits = head(train_emb[idx])
    loss = loss_fn(logits, train_y[idx])
    opt.zero_grad()
    loss.backward()
    opt.step()
    if step % 40 == 0:
        # Batch accuracy on the mini-batch just trained on.
        acc = (logits.argmax(1) == train_y[idx]).float().mean()
        print(f"step {step:3d} loss={loss.item():.3f} acc={acc.item():.3f}")
Example 2: Autoencoder for anomaly detection
# Example 2: Autoencoder for anomaly detection -- Deep Learning
import torch
from torch import nn

torch.manual_seed(0)

# In-distribution samples vs. held-out outliers that are shifted and scaled
# well away from the training distribution.
normal = torch.randn(1_000, 16)
anomaly = torch.randn(50, 16) * 3 + 4

class AE(nn.Module):
    """Tiny bottleneck autoencoder: 16 -> 8 -> h -> 8 -> 16."""

    def __init__(self, d=16, h=4):
        super().__init__()
        self.enc = nn.Sequential(nn.Linear(d, 8), nn.ReLU(), nn.Linear(8, h))
        self.dec = nn.Sequential(nn.Linear(h, 8), nn.ReLU(), nn.Linear(8, d))

    def forward(self, x):
        return self.dec(self.enc(x))

ae = AE()
opt = torch.optim.Adam(ae.parameters(), lr=1e-3)

# Full-batch reconstruction training on normal data only; the model never
# sees an anomaly, so outliers should reconstruct poorly.
for epoch in range(40):
    loss = nn.functional.mse_loss(ae(normal), normal)
    opt.zero_grad()
    loss.backward()
    opt.step()

# Per-sample reconstruction error is the anomaly score.
err_normal = ((ae(normal) - normal) ** 2).mean(dim=1).detach()
err_anomaly = ((ae(anomaly) - anomaly) ** 2).mean(dim=1).detach()
print(f"normal median error : {err_normal.median():.3f}")
print(f"anomaly median error : {err_anomaly.median():.3f}")
Example 3: Self-attention from scratch in NumPy
# Example 3: Self-attention from scratch in NumPy -- Deep Learning
import numpy as np

rng = np.random.default_rng(0)
T, d_model, d_k = 6, 16, 8  # sequence length, dims

# Random token representations and scaled random projection matrices
# for queries, keys and values.
x = rng.standard_normal((T, d_model))
Wq = rng.standard_normal((d_model, d_k)) / np.sqrt(d_model)
Wk = rng.standard_normal((d_model, d_k)) / np.sqrt(d_model)
Wv = rng.standard_normal((d_model, d_k)) / np.sqrt(d_model)

Q, K, V = x @ Wq, x @ Wk, x @ Wv

def _softmax(z):
    """Row-wise softmax with max-subtraction for numerical stability."""
    e = np.exp(z - z.max(axis=-1, keepdims=True))
    return e / e.sum(axis=-1, keepdims=True)

# Scaled dot-product attention: softmax(Q K^T / sqrt(d_k)) V
weights = _softmax(Q @ K.T / np.sqrt(d_k))
out = weights @ V

print("attention matrix (rounded):\n", np.round(weights, 2))
print("\noutput shape :", out.shape)
Example 4: PyTorch MLP training loop
# Example 4: PyTorch MLP training loop -- Deep Learning
import torch
from torch import nn
from torch.utils.data import DataLoader, TensorDataset

torch.manual_seed(0)

# Synthetic binary-classification data: labels from a random linear rule
# plus Gaussian label noise.
X = torch.randn(2_000, 20)
w = torch.randn(20, 1)
y = (X @ w + 0.3 * torch.randn(2_000, 1) > 0).float()

loader = DataLoader(TensorDataset(X, y), batch_size=64, shuffle=True)

model = nn.Sequential(
    nn.Linear(20, 64), nn.ReLU(),
    nn.Linear(64, 32), nn.ReLU(),
    nn.Linear(32, 1),  # raw logit; BCEWithLogitsLoss applies the sigmoid
)
opt = torch.optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.BCEWithLogitsLoss()

for epoch in range(1, 6):
    total = 0.0
    for xb, yb in loader:
        opt.zero_grad()
        loss = loss_fn(model(xb), yb)
        loss.backward()
        opt.step()
        # Sample-weighted running sum so the epoch average is exact even
        # though the last batch may be smaller.
        total += loss.item() * xb.size(0)
    print(f"epoch {epoch}: loss = {total/len(loader.dataset):.4f}")
Example 5: Keras CNN for MNIST
# Example 5: Keras CNN for MNIST -- Deep Learning
import tensorflow as tf
from tensorflow.keras import layers, models

# Load MNIST (downloads on first run), scale pixels to [0, 1], and add a
# trailing channel axis so images are (28, 28, 1).
(x_tr, y_tr), (x_te, y_te) = tf.keras.datasets.mnist.load_data()
x_tr = x_tr[..., None] / 255.0
x_te = x_te[..., None] / 255.0

# Two conv stages, then global average pooling (instead of Flatten) to keep
# the parameter count small before the 10-way softmax.
model = models.Sequential([
    layers.Conv2D(32, 3, activation="relu", input_shape=(28, 28, 1)),
    layers.MaxPool2D(),
    layers.Conv2D(64, 3, activation="relu"),
    layers.GlobalAveragePooling2D(),
    layers.Dense(10, activation="softmax"),
])

model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",  # integer labels, no one-hot
    metrics=["accuracy"],
)

# Short training run with a 10% validation split, then held-out accuracy.
model.fit(x_tr, y_tr, epochs=3, batch_size=128, validation_split=0.1)
print("test acc:", round(model.evaluate(x_te, y_te, verbose=0)[1], 4))