08: PyTorch Profiling¶
This notebook is an experiment to try out the PyTorch profiler.
See the official PyTorch profiler documentation for more.
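As a quick warm-up, here's a minimal standalone sketch of the profiler API (a toy linear layer is used purely for illustration; nothing here is part of the training code below): wrap the work you want to measure in torch.profiler.profile and print the aggregated results.
import torch
from torch.profiler import profile, ProfilerActivity

# Toy model and batch (illustration only)
toy_model = torch.nn.Linear(128, 10)
x = torch.randn(32, 128)

# Profile the CPU (and the GPU too, if available)
activities = [ProfilerActivity.CPU]
if torch.cuda.is_available():
    activities.append(ProfilerActivity.CUDA)
    toy_model, x = toy_model.cuda(), x.cuda()

with profile(activities=activities, record_shapes=True) as prof:
    toy_model(x)

# Show the most expensive operators
print(prof.key_averages().table(sort_by="cpu_time_total", row_limit=10))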
import torch
import torchvision
from torch import nn
from torchvision import transforms, datasets
from torchinfo import summary
import numpy as np
import matplotlib.pyplot as plt
from going_modular import data_setup, engine
Setup device¶
device = "cuda" if torch.cuda.is_available() else "cpu"
device
'cuda'
Get and load data¶
import os
import requests
from zipfile import ZipFile
def get_food_image_data():
    if not os.path.exists("data/10_whole_foods"):
        os.makedirs("data/", exist_ok=True)

        # Download data
        data_url = "https://storage.googleapis.com/food-vision-image-playground/10_whole_foods.zip"
        zip_path = "data/10_whole_foods.zip"
        print(f"Downloading data from {data_url}...")
        response = requests.get(data_url)
        with open(zip_path, "wb") as f:
            f.write(response.content)

        # Unzip data
        targ_dir = "data/10_whole_foods"
        print(f"Extracting data to {targ_dir}...")
        with ZipFile(zip_path) as zip_ref:
            zip_ref.extractall(targ_dir)
    else:
        print("data/10_whole_foods dir exists, skipping download")
get_food_image_data()
data/10_whole_foods dir exists, skipping download
# Setup dirs
train_dir = "data/10_whole_foods/train"
test_dir = "data/10_whole_foods/test"
# Setup ImageNet normalization levels (turns all images into similar distribution as ImageNet)
normalize = transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225])
# Create starter transform
simple_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    normalize
])
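Quick sanity check (a sketch using a randomly generated image as a stand-in for a real training photo): applying simple_transform to a single PIL image should yield a normalized (3, 224, 224) float tensor.
from PIL import Image

# Random RGB image standing in for a real training image (illustration only)
dummy_img = Image.fromarray(np.uint8(np.random.rand(300, 400, 3) * 255))
transformed = simple_transform(dummy_img)
print(transformed.shape, transformed.dtype)  # torch.Size([3, 224, 224]) torch.float32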
# Create data loaders
train_dataloader, test_dataloader, class_names = data_setup.create_dataloaders(
    train_dir=train_dir,
    test_dir=test_dir,
    transform=simple_transform,
    batch_size=32,
    num_workers=8
)
train_dataloader, test_dataloader, class_names
(<torch.utils.data.dataloader.DataLoader at 0x7f052ab26e20>, <torch.utils.data.dataloader.DataLoader at 0x7f05299eed00>, ['apple', 'banana', 'beef', 'blueberries', 'carrots', 'chicken_wings', 'egg', 'honey', 'mushrooms', 'strawberries'])
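For reference, create_dataloaders() from going_modular essentially wraps torchvision's ImageFolder and DataLoader. A rough sketch of what it does (not the actual source; the exact defaults may differ):
from torch.utils.data import DataLoader

def create_dataloaders_sketch(train_dir, test_dir, transform, batch_size, num_workers):
    # Build datasets from class-named folders
    train_data = datasets.ImageFolder(train_dir, transform=transform)
    test_data = datasets.ImageFolder(test_dir, transform=transform)
    class_names = train_data.classes

    # Wrap the datasets in iterable DataLoaders
    train_dl = DataLoader(train_data, batch_size=batch_size, shuffle=True,
                          num_workers=num_workers, pin_memory=True)
    test_dl = DataLoader(test_data, batch_size=batch_size, shuffle=False,
                         num_workers=num_workers, pin_memory=True)
    return train_dl, test_dl, class_names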
Load model¶
model = torchvision.models.efficientnet_b0(pretrained=True).to(device)
# model
# Update the classifier
model.classifier = torch.nn.Sequential(
    nn.Dropout(p=0.2),
    nn.Linear(1280, len(class_names)).to(device))

# Freeze all base layers
for param in model.features.parameters():
    param.requires_grad = False
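To confirm the freeze took effect, here's a quick parameter count (a small added check, not part of the original workflow):
# Count trainable vs total parameters after freezing the base layers
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
print(f"Trainable params: {trainable_params:,} / {total_params:,}")

# torchinfo's summary() (imported above) gives a per-layer view of the same information, e.g.:
# summary(model, input_size=(32, 3, 224, 224), col_names=["num_params", "trainable"])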
Train model and track results¶
# Define loss and optimizer
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
Adjust training function to track results with SummaryWriter¶
model.name = "EfficietNetB0"
model.name
'EfficietNetB0'
from datetime import datetime

from torch.utils.tensorboard import SummaryWriter
from going_modular.engine import train_step, test_step
from tqdm import tqdm
writer = SummaryWriter()
Update the train_step() function to include the PyTorch profiler.
def train_step(model, dataloader, loss_fn, optimizer):
    model.train()
    train_loss, train_acc = 0, 0

    ## NEW: Add PyTorch profiler
    dir_to_save_logs = os.path.join("logs", datetime.now().strftime("%Y-%m-%d-%H-%M"))
    with torch.profiler.profile(
        on_trace_ready=torch.profiler.tensorboard_trace_handler(dir_name=dir_to_save_logs),
        # with_stack=True # this adds a lot of overhead to training (tracing all the stack)
    ):
        for batch, (X, y) in enumerate(dataloader):
            # Send data to GPU
            X, y = X.to(device, non_blocking=True), y.to(device, non_blocking=True)

            # Turn on mixed precision for the forward pass + loss (toggle enabled to compare)
            with torch.autocast(device_type=device, enabled=True):
                # 1. Forward pass
                y_pred = model(X)

                # 2. Calculate loss
                loss = loss_fn(y_pred, y)

            # 3. Optimizer zero grad
            optimizer.zero_grad()

            # 4. Loss backward (outside the autocast context, as recommended)
            loss.backward()

            # 5. Optimizer step
            optimizer.step()

            # 6. Calculate metrics
            train_loss += loss.item()
            y_pred_class = torch.softmax(y_pred, dim=1).argmax(dim=1)
            # print(f"y: \n{y}\ny_pred_class: {y_pred_class}")
            # print(f"y argmax: {y_pred.argmax(dim=1)}")
            # print(f"Equal: {(y_pred_class == y)}")
            train_acc += (y_pred_class == y).sum().item() / len(y_pred)
            # print(f"batch: {batch} train_acc: {train_acc}")

    # Adjust returned metrics
    return train_loss / len(dataloader), train_acc / len(dataloader)
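One note on overhead: the profiler above traces every batch in the epoch. torch.profiler also supports a schedule that records only a handful of steps, which keeps traces small and overhead low. A minimal standalone sketch (dummy model and a hypothetical log directory, not used in the runs below):
from torch.profiler import profile, schedule, tensorboard_trace_handler

# Skip 1 step, warm up for 1, then record 3 steps and stop
prof_schedule = schedule(wait=1, warmup=1, active=3, repeat=1)

dummy_model = torch.nn.Linear(128, 10)
dummy_batches = [torch.randn(32, 128) for _ in range(8)]

with profile(schedule=prof_schedule,
             on_trace_ready=tensorboard_trace_handler("logs/schedule_demo")) as prof:
    for x in dummy_batches:
        dummy_model(x)
        prof.step()  # signal the profiler that the next step has started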
Now, to use the writer, we've got to adjust the train() function to log loss and accuracy at the end of each epoch.
def train(
    model,
    train_dataloader,
    test_dataloader,
    optimizer,
    loss_fn=nn.CrossEntropyLoss(),
    epochs=5,
):
    results = {"train_loss": [], "train_acc": [], "test_loss": [], "test_acc": []}

    for epoch in tqdm(range(epochs)):
        train_loss, train_acc = train_step(
            model=model,
            dataloader=train_dataloader,
            loss_fn=loss_fn,
            optimizer=optimizer,
        )
        test_loss, test_acc = test_step(
            model=model, dataloader=test_dataloader, loss_fn=loss_fn
        )

        # Print out what's happening
        print(
            f"Epoch: {epoch+1} | "
            f"train_loss: {train_loss:.4f} | "
            f"train_acc: {train_acc:.4f} | "
            f"test_loss: {test_loss:.4f} | "
            f"test_acc: {test_acc:.4f}"
        )

        # Update results
        results["train_loss"].append(train_loss)
        results["train_acc"].append(train_acc)
        results["test_loss"].append(test_loss)
        results["test_acc"].append(test_acc)

        # Add results to SummaryWriter
        writer.add_scalars(main_tag="Loss",
                           tag_scalar_dict={"train_loss": train_loss,
                                            "test_loss": test_loss},
                           global_step=epoch)
        writer.add_scalars(main_tag="Accuracy",
                           tag_scalar_dict={"train_acc": train_acc,
                                            "test_acc": test_acc},
                           global_step=epoch)

    # Close the writer
    writer.close()

    return results
# Train model
# Note: Not using engine.train() since the original script isn't updated
results = train(model=model,
                train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=5)
20%|██ | 1/5 [00:05<00:21, 5.27s/it]
Epoch: 1 | train_loss: 1.9644 | train_acc: 0.4386 | test_loss: 1.5205 | test_acc: 0.7865
40%|████ | 2/5 [00:09<00:14, 4.94s/it]
Epoch: 2 | train_loss: 1.2589 | train_acc: 0.7878 | test_loss: 1.1589 | test_acc: 0.7604
60%|██████ | 3/5 [00:14<00:09, 4.72s/it]
Epoch: 3 | train_loss: 0.8642 | train_acc: 0.8776 | test_loss: 0.9347 | test_acc: 0.7917
80%|████████ | 4/5 [00:18<00:04, 4.56s/it]
Epoch: 4 | train_loss: 0.6827 | train_acc: 0.8856 | test_loss: 0.6637 | test_acc: 0.8750
100%|██████████| 5/5 [00:23<00:00, 4.65s/it]
Epoch: 5 | train_loss: 0.5688 | train_acc: 0.9069 | test_loss: 0.6175 | test_acc: 0.8854
Looks like mixed precision doesn't offer much benefit for this smaller feature extraction workload; with autocast enabled, the epochs actually ran slightly slower (compare the timings below).
# # Without mixed precision
# 20%|██ | 1/5 [00:03<00:14, 3.71s/it]Epoch: 1 | train_loss: 0.5229 | train_acc: 0.9054 | test_loss: 0.5776 | test_acc: 0.8542
# 40%|████ | 2/5 [00:07<00:11, 3.74s/it]Epoch: 2 | train_loss: 0.4699 | train_acc: 0.9001 | test_loss: 0.5160 | test_acc: 0.8802
# 60%|██████ | 3/5 [00:10<00:07, 3.63s/it]Epoch: 3 | train_loss: 0.3913 | train_acc: 0.9196 | test_loss: 0.4888 | test_acc: 0.8906
# 80%|████████ | 4/5 [00:14<00:03, 3.61s/it]Epoch: 4 | train_loss: 0.3724 | train_acc: 0.9371 | test_loss: 0.4931 | test_acc: 0.8698
# 100%|██████████| 5/5 [00:18<00:00, 3.61s/it]Epoch: 5 | train_loss: 0.3315 | train_acc: 0.9381 | test_loss: 0.4405 | test_acc: 0.8750
# # With mixed precision
# 20%|██ | 1/5 [00:04<00:17, 4.40s/it]Epoch: 1 | train_loss: 0.3027 | train_acc: 0.9554 | test_loss: 0.4386 | test_acc: 0.8802
# 40%|████ | 2/5 [00:08<00:13, 4.49s/it]Epoch: 2 | train_loss: 0.2826 | train_acc: 0.9539 | test_loss: 0.4080 | test_acc: 0.8802
# 60%|██████ | 3/5 [00:13<00:08, 4.48s/it]Epoch: 3 | train_loss: 0.2450 | train_acc: 0.9609 | test_loss: 0.4130 | test_acc: 0.8750
# 80%|████████ | 4/5 [00:18<00:04, 4.53s/it]Epoch: 4 | train_loss: 0.2450 | train_acc: 0.9594 | test_loss: 0.4158 | test_acc: 0.8802
# 100%|██████████| 5/5 [00:22<00:00, 4.49s/it]Epoch: 5 | train_loss: 0.2307 | train_acc: 0.9639 | test_loss: 0.4124 | test_acc: 0.8906
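Note that train_step() above uses torch.autocast on its own. The canonical CUDA mixed precision recipe also pairs autocast with a GradScaler to avoid fp16 gradient underflow. A minimal sketch of how the per-batch logic would change (not used in the runs above; amp_train_batch is a hypothetical helper):
scaler = torch.cuda.amp.GradScaler()

def amp_train_batch(model, X, y, loss_fn, optimizer):
    optimizer.zero_grad()
    with torch.autocast(device_type="cuda"):
        y_pred = model(X)          # forward pass in mixed precision
        loss = loss_fn(y_pred, y)
    scaler.scale(loss).backward()  # scale the loss, backprop scaled gradients
    scaler.step(optimizer)         # unscales gradients, then calls optimizer.step()
    scaler.update()                # adjust the scale factor for the next iteration
    return loss.item()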
Try mixed precision with a larger training workload¶
Now we'll try turning on mixed precision with a larger training workload (EfficientNetB0 with all of its base layers unfrozen and trainable).
# Unfreeze all base layers
for param in model.features.parameters():
    param.requires_grad = True

# for param in model.features.parameters():
#     print(param.requires_grad)
# Train model
# Note: Not using engine.train() since the original script isn't updated
results = train(model=model,
                train_dataloader=train_dataloader,
                test_dataloader=test_dataloader,
                optimizer=optimizer,
                loss_fn=loss_fn,
                epochs=5)
20%|██ | 1/5 [00:13<00:53, 13.27s/it]
Epoch: 1 | train_loss: 0.4934 | train_acc: 0.8586 | test_loss: 0.6467 | test_acc: 0.7969
40%|████ | 2/5 [00:27<00:42, 14.09s/it]
Epoch: 2 | train_loss: 0.1750 | train_acc: 0.9628 | test_loss: 1.1806 | test_acc: 0.8385
60%|██████ | 3/5 [00:42<00:28, 14.28s/it]
Epoch: 3 | train_loss: 0.1362 | train_acc: 0.9619 | test_loss: 0.5831 | test_acc: 0.8802
80%|████████ | 4/5 [00:57<00:14, 14.47s/it]
Epoch: 4 | train_loss: 0.1743 | train_acc: 0.9462 | test_loss: 0.5702 | test_acc: 0.8854
100%|██████████| 5/5 [01:11<00:00, 14.38s/it]
Epoch: 5 | train_loss: 0.2437 | train_acc: 0.9352 | test_loss: 0.7096 | test_acc: 0.8125
# # Without mixed precision...
# 20%|██ | 1/5 [00:11<00:46, 11.61s/it]Epoch: 1 | train_loss: 0.4507 | train_acc: 0.8648 | test_loss: 1.0603 | test_acc: 0.7604
# 40%|████ | 2/5 [00:24<00:36, 12.21s/it]Epoch: 2 | train_loss: 0.1659 | train_acc: 0.9464 | test_loss: 0.6398 | test_acc: 0.8490
# 60%|██████ | 3/5 [00:36<00:24, 12.38s/it]Epoch: 3 | train_loss: 0.1261 | train_acc: 0.9698 | test_loss: 0.7149 | test_acc: 0.8542
# 80%|████████ | 4/5 [00:49<00:12, 12.53s/it]Epoch: 4 | train_loss: 0.1250 | train_acc: 0.9609 | test_loss: 0.7441 | test_acc: 0.7917
# 100%|██████████| 5/5 [01:02<00:00, 12.42s/it]Epoch: 5 | train_loss: 0.1282 | train_acc: 0.9564 | test_loss: 0.8701 | test_acc: 0.8385
# # With mixed precision...
# 20%|██ | 1/5 [00:13<00:53, 13.27s/it]Epoch: 1 | train_loss: 0.4934 | train_acc: 0.8586 | test_loss: 0.6467 | test_acc: 0.7969
# 40%|████ | 2/5 [00:27<00:42, 14.09s/it]Epoch: 2 | train_loss: 0.1750 | train_acc: 0.9628 | test_loss: 1.1806 | test_acc: 0.8385
# 60%|██████ | 3/5 [00:42<00:28, 14.28s/it]Epoch: 3 | train_loss: 0.1362 | train_acc: 0.9619 | test_loss: 0.5831 | test_acc: 0.8802
# 80%|████████ | 4/5 [00:57<00:14, 14.47s/it]Epoch: 4 | train_loss: 0.1743 | train_acc: 0.9462 | test_loss: 0.5702 | test_acc: 0.8854
# 100%|██████████| 5/5 [01:11<00:00, 14.38s/it]Epoch: 5 | train_loss: 0.2437 | train_acc: 0.9352 | test_loss: 0.7096 | test_acc: 0.8125
Checking the PyTorch profiler output, mixed precision does use some Tensor Cores, but utilisation is low, only around 9-12%.
A plausible explanation for the slowdown is that only a small fraction of the kernels can run on Tensor Cores here, so the time spent casting tensors between float32 and float16 outweighs the speed-up on the few operations that do get accelerated.
Extensions¶
- Does changing the input image size to suit EfficientNetB4 change its results? E.g. an input size of (380, 380) instead of (224, 224)? (A starter sketch is below.)
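A possible starting point for that extension (a sketch, not run here): EfficientNetB4 with its native 380x380 input resolution and the classifier swapped out for the 10 food classes.
# Transform using EfficientNetB4's native input resolution
b4_transform = transforms.Compose([
    transforms.Resize((380, 380)),
    transforms.ToTensor(),
    normalize
])

# Load a pretrained EfficientNetB4 and replace its classifier head
model_b4 = torchvision.models.efficientnet_b4(pretrained=True).to(device)
model_b4.classifier = nn.Sequential(
    nn.Dropout(p=0.2),
    nn.Linear(model_b4.classifier[1].in_features, len(class_names))
).to(device)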