AI Infrastructure 2026: GPU Optimization and Cost-Effective Scaling

GPU costs dominate AI budgets. This guide covers practical strategies for optimizing GPU utilization, reducing costs, and building efficient AI infrastructure at scale.

The GPU Challenge

Modern AI workloads face several infrastructure challenges:

  • Cost: A100/H100-class GPUs typically run $2-4+ per GPU-hour in the cloud
  • Scarcity: Limited availability during peak demand
  • Utilization: Average GPU utilization often sits at only 30-50% (see the measurement sketch below)
  • Complexity: Managing distributed workloads is hard
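
Before optimizing, measure. A minimal sketch using the NVML bindings (pynvml, assuming the nvidia-ml-py package is installed) that samples utilization and memory on GPU 0:

import time
import pynvml

pynvml.nvmlInit()
handle = pynvml.nvmlDeviceGetHandleByIndex(0)  # GPU 0

for _ in range(10):
    util = pynvml.nvmlDeviceGetUtilizationRates(handle)  # SM / memory-bandwidth utilization in %
    mem = pynvml.nvmlDeviceGetMemoryInfo(handle)          # bytes used / total
    print(f"GPU util: {util.gpu}%  mem: {mem.used / 1e9:.1f}/{mem.total / 1e9:.1f} GB")
    time.sleep(1)

pynvml.nvmlShutdown()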

GPU Memory Optimization

Model Quantization

import torch
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

# 4-bit quantization for 75% memory reduction
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True
)

model = AutoModelForCausalLM.from_pretrained(
    "meta-llama/Llama-2-70b-hf",
    quantization_config=bnb_config,
    device_map="auto"
)

# Memory comparison:
# FP16: ~140GB (needs 2x A100 80GB)
# INT8: ~70GB (1x A100 80GB)  
# INT4: ~35GB (1x A100 40GB, tight fit; too large for a 24GB RTX 4090)
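
These figures follow directly from parameter count times bytes per parameter. A rough back-of-the-envelope helper (weights only; KV cache, activations, and framework overhead are ignored):

def weight_memory_gb(num_params_billion, bits_per_param):
    # Weight-only estimate: parameters x bytes per parameter
    return num_params_billion * bits_per_param / 8

for bits in (16, 8, 4):
    print(f"Llama-2-70B @ {bits}-bit: ~{weight_memory_gb(70, bits):.0f} GB")
# 16-bit: ~140 GB, 8-bit: ~70 GB, 4-bit: ~35 GB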

Gradient Checkpointing

import torch.nn as nn
from torch.utils.checkpoint import checkpoint

class MemoryEfficientModel(nn.Module):
    """Wraps a plain PyTorch module so activations are recomputed during backward."""
    def __init__(self, base_model):
        super().__init__()
        self.base_model = base_model

    def forward(self, x):
        # Trades compute for memory: activations are dropped and recomputed in backward
        return checkpoint(self.base_model, x, use_reentrant=False)

# For Hugging Face transformers models, use the built-in switch instead
model.gradient_checkpointing_enable()

Mixed Precision Training

import torch
from torch.cuda.amp import autocast, GradScaler

# GradScaler is needed for float16; bfloat16 has enough dynamic range to skip loss scaling
scaler = GradScaler()

for inputs, targets in dataloader:
    optimizer.zero_grad()

    with autocast(dtype=torch.float16):
        outputs = model(inputs)
        loss = criterion(outputs, targets)

    scaler.scale(loss).backward()
    scaler.step(optimizer)
    scaler.update()

Efficient Inference

vLLM for High-Throughput Serving

from vllm import LLM, SamplingParams

# PagedAttention for efficient memory management
llm = LLM(
    model="meta-llama/Llama-2-7b-chat-hf",
    tensor_parallel_size=2,  # Use 2 GPUs
    gpu_memory_utilization=0.9,
    max_model_len=4096
)

# Batch multiple requests
prompts = ["Question 1?", "Question 2?", "Question 3?"]
sampling_params = SamplingParams(temperature=0.7, max_tokens=256)

# Process batch efficiently
outputs = llm.generate(prompts, sampling_params)
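
Each element of outputs is a RequestOutput; the generated text lives on its completions:

for output in outputs:
    # output.prompt is the input; output.outputs[0].text is the top completion
    print(output.prompt, "->", output.outputs[0].text)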

Continuous Batching

# Hugging Face Text Generation Inference (TGI) with continuous batching
text-generation-launcher \
    --model-id meta-llama/Llama-2-7b-chat-hf \
    --max-batch-total-tokens 32768 \
    --max-waiting-tokens 20 \
    --max-concurrent-requests 128

# Handles dynamic batching automatically:
# new requests join the running batch between decoding steps
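
A client can then send requests to the running server; a minimal sketch using the text_generation Python client (assuming TGI is listening on localhost:8080):

from text_generation import Client

client = Client("http://localhost:8080")
response = client.generate("What is continuous batching?", max_new_tokens=128)
print(response.generated_text)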

KV Cache Optimization

# Use Flash Attention for memory-efficient attention
from flash_attn import flash_attn_func

def efficient_attention(q, k, v):
    # Flash Attention: O(N) memory vs O(N²) for standard attention
    return flash_attn_func(q, k, v, causal=True)

# Or use torch SDPA
import torch.nn.functional as F

output = F.scaled_dot_product_attention(
    query, key, value,
    attn_mask=None,
    dropout_p=0.0,
    is_causal=True
)

Cost Optimization Strategies

Spot/Preemptible Instances

# AWS Spot Instance pricing (example)
# On-demand p4d.24xlarge: $32.77/hour
# Spot p4d.24xlarge: ~$10-15/hour (roughly 55-70% savings)

import boto3

ec2 = boto3.client('ec2')

# Request spot instances
response = ec2.request_spot_instances(
    InstanceCount=1,
    LaunchSpecification={
        'ImageId': 'ami-deep-learning',
        'InstanceType': 'p4d.24xlarge',
        'SecurityGroups': ['ml-training'],
    },
    SpotPrice='15.00',  # Max price
    Type='one-time'
)
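
Spot capacity can be reclaimed with only a two-minute warning, so training jobs should watch for the interruption notice and flush a final checkpoint. A minimal sketch polling the EC2 instance metadata endpoint (IMDSv1 path shown for brevity):

import requests

INTERRUPTION_URL = "http://169.254.169.254/latest/meta-data/spot/instance-action"

def spot_interruption_pending():
    # Normally returns 404; returns 200 with a JSON action once termination is scheduled
    try:
        return requests.get(INTERRUPTION_URL, timeout=1).status_code == 200
    except requests.RequestException:
        return False

# Poll this inside the training loop and save a checkpoint (see below) before the instance is reclaimed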

Checkpoint and Resume

import torch
from pathlib import Path

class CheckpointManager:
    def __init__(self, checkpoint_dir, save_interval=1000):
        self.checkpoint_dir = Path(checkpoint_dir)
        self.checkpoint_dir.mkdir(parents=True, exist_ok=True)
        self.save_interval = save_interval

    def save(self, model, optimizer, step, loss):
        checkpoint = {
            'step': step,
            'model_state': model.state_dict(),
            'optimizer_state': optimizer.state_dict(),
            'loss': loss
        }
        path = self.checkpoint_dir / f"checkpoint_{step}.pt"
        torch.save(checkpoint, path)

        # Upload to S3 for durability
        self._upload_to_s3(path)

    def _upload_to_s3(self, path):
        # Placeholder: push the checkpoint to object storage,
        # e.g. boto3.client("s3").upload_file(str(path), bucket, key)
        pass

    def load_latest(self, model, optimizer):
        # Sort numerically by step; a lexicographic sort would misorder checkpoint_10000 vs checkpoint_2000
        checkpoints = sorted(
            self.checkpoint_dir.glob("checkpoint_*.pt"),
            key=lambda p: int(p.stem.split("_")[1])
        )
        if not checkpoints:
            return 0

        checkpoint = torch.load(checkpoints[-1], map_location="cpu")
        model.load_state_dict(checkpoint['model_state'])
        optimizer.load_state_dict(checkpoint['optimizer_state'])
        return checkpoint['step']

# Usage in training loop
ckpt_manager = CheckpointManager("./checkpoints")
start_step = ckpt_manager.load_latest(model, optimizer)

for step in range(start_step, total_steps):
    loss = train_step(model, batch)  # batch fetched from the dataloader

    if step % ckpt_manager.save_interval == 0:
        ckpt_manager.save(model, optimizer, step, loss)

Right-Sizing GPU Selection

# GPU selection guide (illustrative cloud prices, USD/hour)
GPU_SPECS = {
    "T4": {"memory": 16, "fp16_tflops": 65, "cost_hr": 0.50},
    "A10G": {"memory": 24, "fp16_tflops": 125, "cost_hr": 1.00},
    "A100_40": {"memory": 40, "fp16_tflops": 312, "cost_hr": 2.00},
    "A100_80": {"memory": 80, "fp16_tflops": 312, "cost_hr": 3.50},
    "H100": {"memory": 80, "fp16_tflops": 990, "cost_hr": 4.00},
}

def recommend_gpu(model_size_gb, batch_size, task_type):
    """Recommend the cheapest GPU that fits; batch_size is a placeholder for an activation-memory term."""
    required_memory = model_size_gb * 1.2  # 20% overhead

    if task_type == "inference":
        required_memory *= 1.1  # KV cache and runtime overhead
    elif task_type == "training":
        required_memory *= 2.5  # Gradients + optimizer states

    suitable = [
        (name, specs) for name, specs in GPU_SPECS.items()
        if specs["memory"] >= required_memory
    ]
    if not suitable:
        raise ValueError("No single GPU fits; shard the model or quantize further")

    # Cheapest GPU that satisfies the memory requirement
    return sorted(suitable, key=lambda x: x[1]["cost_hr"])[0]

# Example
gpu = recommend_gpu(model_size_gb=14, batch_size=32, task_type="inference")
print(f"Recommended: {gpu[0]}")  # ~18.5 GB required -> A10G

Multi-GPU Strategies

Data Parallelism

import torch
import torch.distributed as dist
from torch.nn.parallel import DistributedDataParallel as DDP
from torch.utils.data import DataLoader, DistributedSampler

def setup_ddp(rank, world_size):
    dist.init_process_group(
        backend="nccl",
        rank=rank,
        world_size=world_size
    )
    torch.cuda.set_device(rank)

def train_ddp(rank, world_size):
    setup_ddp(rank, world_size)

    model = MyModel().cuda(rank)
    model = DDP(model, device_ids=[rank])
    optimizer = torch.optim.AdamW(model.parameters())

    # Sampler ensures each GPU gets a different shard of the data
    sampler = DistributedSampler(dataset, num_replicas=world_size, rank=rank)
    dataloader = DataLoader(dataset, sampler=sampler, batch_size=32)

    for epoch in range(epochs):
        sampler.set_epoch(epoch)  # reshuffle shards each epoch
        for batch in dataloader:
            optimizer.zero_grad()
            loss = model(batch.cuda(rank))  # assumes the model's forward returns the loss
            loss.backward()  # DDP all-reduces gradients across ranks here
            optimizer.step()

# Launch with torchrun
# torchrun --nproc_per_node=4 train.py

Model Parallelism with DeepSpeed

import deepspeed

# DeepSpeed config (inline equivalent of ds_config.json)
config = {
    "train_batch_size": 256,
    "gradient_accumulation_steps": 4,
    "fp16": {"enabled": True},
    "zero_optimization": {
        "stage": 3,  # Full model sharding
        "offload_optimizer": {"device": "cpu"},
        "offload_param": {"device": "cpu"}
    }
}

model, optimizer, _, _ = deepspeed.initialize(
    model=model,
    model_parameters=model.parameters(),
    config=config
)

for batch in dataloader:
    loss = model(batch)
    model.backward(loss)
    model.step()
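
Multi-GPU runs are typically started with the DeepSpeed launcher (assuming the script above is saved as train.py):

# Launch across 4 local GPUs
# deepspeed --num_gpus=4 train.py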

Monitoring and Profiling

NVIDIA DCGM Metrics

# Prometheus scrape config for GPU metrics
scrape_configs:
  - job_name: 'dcgm'
    static_configs:
      - targets: ['localhost:9400']
    metrics_path: /metrics

# Key metrics to monitor
DCGM_METRICS = [
    "DCGM_FI_DEV_GPU_UTIL",          # GPU utilization %
    "DCGM_FI_DEV_MEM_COPY_UTIL",     # Memory bandwidth %
    "DCGM_FI_DEV_FB_USED",           # Memory used (MB)
    "DCGM_FI_DEV_POWER_USAGE",       # Power draw (W)
    "DCGM_FI_DEV_GPU_TEMP",          # Temperature (C)
    "DCGM_FI_DEV_SM_CLOCK",          # SM clock (MHz)
]
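
To act on these metrics programmatically, query the Prometheus HTTP API; a minimal sketch (assuming Prometheus at localhost:9090) that flags underutilized GPUs:

import requests

PROM_URL = "http://localhost:9090/api/v1/query"

# Average SM utilization per GPU over the last hour
query = "avg_over_time(DCGM_FI_DEV_GPU_UTIL[1h])"
result = requests.get(PROM_URL, params={"query": query}).json()

for series in result["data"]["result"]:
    gpu = series["metric"].get("gpu", "?")
    util = float(series["value"][1])
    if util < 50:
        print(f"GPU {gpu}: {util:.0f}% average utilization -- candidate for consolidation")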

PyTorch Profiler

from torch.profiler import profile, schedule, ProfilerActivity, tensorboard_trace_handler

with profile(
    activities=[ProfilerActivity.CPU, ProfilerActivity.CUDA],
    schedule=schedule(wait=1, warmup=1, active=3),
    on_trace_ready=tensorboard_trace_handler('./logs'),
    record_shapes=True,
    profile_memory=True,
    with_stack=True
) as prof:
    for step, batch in enumerate(dataloader):
        model(batch)
        prof.step()

# View in TensorBoard
# tensorboard --logdir=./logs
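
For a quick look without TensorBoard, the profiler can also print an operator-level summary:

# Top operators by CUDA time; includes memory stats when profile_memory=True
print(prof.key_averages().table(sort_by="cuda_time_total", row_limit=10))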

Infrastructure as Code

Terraform for GPU Clusters

resource "aws_instance" "gpu_training" {
  ami           = "ami-0123456789"  # Deep Learning AMI
  instance_type = "p4d.24xlarge"
  
  root_block_device {
    volume_size = 500
    volume_type = "gp3"
    iops        = 16000
    throughput  = 1000
  }
  
  # Use placement group for NCCL performance
  placement_group = aws_placement_group.ml_cluster.id
  
  tags = {
    Name = "gpu-training-node"
    Team = "ml-platform"
  }
}

resource "aws_placement_group" "ml_cluster" {
  name     = "ml-training-cluster"
  strategy = "cluster"  # Low latency networking
}

Best Practices Summary

  • Memory: Use quantization (INT8/INT4) when possible
  • Training: Enable gradient checkpointing and mixed precision
  • Inference: Use vLLM/TGI with continuous batching
  • Cost: Spot instances plus checkpointing for training
  • Scaling: Start with data parallelism, add model parallelism if needed
  • Monitoring: Track GPU utilization, memory, and throughput

Conclusion

Efficient AI infrastructure requires a holistic approach: optimize at the model level (quantization, efficient architectures), system level (batching, caching), and infrastructure level (right-sizing, spot instances). Start with measurement, identify bottlenecks, and optimize incrementally.
