How to Fix CUDA Out of Memory in PyTorch
Reduce the batch size first — it is the quickest fix. If that is not enough, try mixed precision with torch.cuda.amp.autocast(), gradient checkpointing, gradient accumulation, or wrapping inference in torch.no_grad().
Fix 1: Reduce Batch Size (Easiest)
# Halving batch size roughly halves activation memory: activations scale
# linearly with batch size, while model weights and optimizer state do not.
train_loader = DataLoader(dataset, batch_size=16) # was 32
Fix 2: Mixed Precision Training (30-40% savings)
# Mixed-precision training loop. GradScaler multiplies the loss before
# backward so small float16 gradients don't underflow to zero.
scaler = torch.cuda.amp.GradScaler()
for x, y in train_loader:
    optimizer.zero_grad()
    # autocast runs eligible ops in float16, roughly halving activation
    # memory; numerically sensitive ops stay in float32 automatically.
    with torch.cuda.amp.autocast():
        output = model(x)
        loss = criterion(output, y)
    scaler.scale(loss).backward()  # backprop on the scaled loss
    scaler.step(optimizer)         # unscales grads; skips step on inf/NaN
    scaler.update()                # adapts the scale factor for next step
Fix 3: Gradient Checkpointing (50-80% activation savings)
from torch.utils.checkpoint import checkpoint


class MyModel(nn.Module):
    """Model whose blocks recompute activations during backward.

    checkpoint() frees each block's intermediate activations after the
    forward pass and recomputes them on-the-fly during backward, trading
    extra compute for a large (often 50-80%) activation-memory saving.
    """

    def forward(self, x):
        # use_reentrant=False is the recommended non-reentrant variant;
        # it supports keyword args and does not require all inputs to
        # have requires_grad=True.
        x = checkpoint(self.block1, x, use_reentrant=False)
        x = checkpoint(self.block2, x, use_reentrant=False)
        return self.head(x)
Fix 4: torch.no_grad() for Inference
# Inference under no_grad(): autograd stores no computation graph or
# intermediate activations, which typically saves well over half the
# memory of a training-mode forward pass.
with torch.no_grad():
    output = model(input_tensor)
Fix 5: Gradient Accumulation
# Simulate batch_size=32 with actual batch_size=8 by accumulating
# gradients over several small batches before each optimizer step.
accumulation_steps = 4
for i, (x, y) in enumerate(train_loader):
    output = model(x)  # batch_size=8
    # Divide the loss so the accumulated gradient matches what one
    # large-batch step would produce.
    loss = criterion(output, y) / accumulation_steps
    loss.backward()  # gradients add up across iterations
    # Step on every full window AND on the final (possibly partial)
    # window — otherwise leftover gradients from a dataset whose length
    # isn't divisible by accumulation_steps are never applied and leak
    # into the next epoch.
    if (i + 1) % accumulation_steps == 0 or (i + 1) == len(train_loader):
        optimizer.step()
        optimizer.zero_grad()
Quick Memory Check
# Report current VRAM usage: "allocated" is memory actively held by
# tensors; "reserved" (cached) is what PyTorch's allocator has claimed
# from the driver, including freed-but-cached segments.
vram_stats = {
    "Allocated": torch.cuda.memory_allocated(),
    "Cached": torch.cuda.memory_reserved(),
}
for label, nbytes in vram_stats.items():
    print(f"{label}: {nbytes / 1e9:.2f} GB")
# Return cached-but-unused segments to the driver. Rarely fixes a true
# OOM (allocated memory is untouched), but can help other processes.
torch.cuda.empty_cache()