Build State-of-the-Art Language Models | From Theory to Production
this is your complete guide to building state-of-the-art large language models, from foundational transformer architecture to deployment at scale — everything you need to go from research papers to production models.
# simplified attention implementation
import torch
import torch.nn.functional as F
def attention(Q, K, V, mask=None):
    """Compute scaled dot-product attention.

    The scores are ``Q @ K^T`` (over the last two dimensions) divided by the
    square root of the size of ``Q``'s last dimension. They are
    softmax-normalised along the last axis and used to weight ``V``.

    Args:
        Q: Query tensor; the size of its last dimension sets the scale.
        K: Key tensor; its last two dimensions are swapped before the
            matrix product with ``Q``.
        V: Value tensor, multiplied by the attention weights.
        mask: Optional tensor broadcastable to the score matrix. Positions
            where ``mask == 0`` are excluded from attention.

    Returns:
        The attention-weighted combination of ``V``.
    """
    d_k = Q.size(-1)
    # Scale by a plain Python float: avoids building an integer tensor just
    # to take its square root, and keeps the scores in Q/K's dtype.
    scores = torch.matmul(Q, K.transpose(-2, -1)) / (d_k ** 0.5)
    if mask is not None:
        # Fill with the most negative finite value of the score dtype.
        # A hard-coded -1e9 is not representable in float16, and -inf would
        # turn a fully masked row into NaN after the softmax.
        scores = scores.masked_fill(mask == 0, torch.finfo(scores.dtype).min)
    attention_weights = F.softmax(scores, dim=-1)
    return torch.matmul(attention_weights, V)
| Dataset | Size | Type | Link |
|---|---|---|---|
| The Pile | 825 GB | diverse text corpus | view |
| RedPajama | 1.2 TB | LLaMA training data replica | view |
| RefinedWeb | 5 TB | high-quality web data | view |
| FineWeb | 15 TB | deduplicated web corpus | view |
| The Stack | 6 TB | code (permissive licenses) | view |
| Dataset | Size | Purpose |
|---|---|---|
| OpenAssistant | 161K | multi-turn conversations |
| UltraChat | 1.5M | diverse dialogues |
| Dolly 15K | 15K | human-generated instructions |
# train BPE tokenizer with HuggingFace
from tokenizers import Tokenizer, models, trainers, pre_tokenizers
# Build an untrained BPE model with byte-level pre-tokenization.
tokenizer = Tokenizer(models.BPE())
tokenizer.pre_tokenizer = pre_tokenizers.ByteLevel(add_prefix_space=False)

trainer = trainers.BpeTrainer(
    vocab_size=50000,
    # NOTE(review): this list previously held five empty strings, which looks
    # like angle-bracket token names lost in extraction. The names below are
    # conventional placeholders - confirm them against the model's expected
    # special tokens before training.
    special_tokens=["<pad>", "<unk>", "<s>", "</s>", "<mask>"],
)

# Learn the merges from the corpus file, then write the tokenizer to disk.
tokenizer.train(files=["corpus.txt"], trainer=trainer)
tokenizer.save("tokenizer.json")
# typical pre-training config
{
"hidden_size": 4096,
"num_layers": 32,
"num_heads": 32,
"intermediate_size": 11008,
"vocab_size": 50000,
"max_position_embeddings": 4096,
"learning_rate": 3e-4,
"warmup_steps": 2000,
"max_steps": 500000,
"batch_size": 256,
"gradient_accumulation_steps": 16,
"weight_decay": 0.1,
"bf16": true
}
# DPO training with TRL
from trl import DPOTrainer, DPOConfig
# Hyperparameters for the preference-optimisation run.
config = DPOConfig(
    beta=0.1,  # KL penalty
    learning_rate=5e-7,
    per_device_train_batch_size=4,
    num_train_epochs=3,
)

# `model`, `ref_model`, `preference_dataset` and `tokenizer` are not defined
# in this snippet; they must already exist when it runs.
trainer = DPOTrainer(
    args=config,
    model=model,
    ref_model=ref_model,
    tokenizer=tokenizer,
    train_dataset=preference_dataset,
)
trainer.train()
# FSDP training
from torch.distributed.fsdp import (
    FullyShardedDataParallel as FSDP,
    MixedPrecision,
    ShardingStrategy,
)

# Wrap the existing `model` in FSDP with bfloat16 parameters and gradient
# reduction. `MixedPrecision` must be imported alongside FSDP, and
# `sharding_strategy` takes a `ShardingStrategy` enum member, not a string.
model = FSDP(
    model,
    mixed_precision=MixedPrecision(
        param_dtype=torch.bfloat16,
        reduce_dtype=torch.bfloat16,
    ),
    sharding_strategy=ShardingStrategy.FULL_SHARD,
    device_id=torch.cuda.current_device(),
)
# train normally - FSDP handles sharding automatically
# LoRA with PEFT
from peft import LoraConfig, get_peft_model
# Adapter settings: rank-16 LoRA on the four attention projections.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,  # rank
    lora_alpha=32,
    lora_dropout=0.05,
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)

# Rebind `model` to the PEFT-wrapped version returned by get_peft_model.
model = get_peft_model(model, lora_config)
# only ~0.5% params trainable, huge memory savings
| Benchmark | Measures | Link |
|---|---|---|
| MMLU | multitask accuracy (57 subjects) | view |
| HellaSwag | commonsense reasoning | view |
| TruthfulQA | truthfulness, avoiding misconceptions | view |
| GSM8K | grade school math reasoning | view |
| HumanEval | code generation (python) | view |
# lm-eval-harness
# Step 1: install the lm-eval package with pip.
pip install lm-eval
# Step 2: run the lm_eval command with the "hf" model type on the
# comma-separated task list below, using the cuda device and a batch size
# of 8, with results/ as the output path.
# "your-model" looks like a placeholder for the model to evaluate.
lm_eval --model hf \
--model_args pretrained=your-model \
--tasks mmlu,hellaswag,gsm8k,humaneval \
--device cuda \
--batch_size 8 \
--output_path results/
Vaswani et al., Google
introduced transformer architecture. self-attention, multi-head attention, positional encodings.
read paper →
Brown et al., OpenAI
demonstrated emergent in-context learning at 175B parameters. scaling laws.
read paper →
Hoffmann et al., DeepMind
chinchilla scaling laws. most LLMs undertrained. ~20 tokens per parameter optimal.
read paper →
Dao et al., Stanford
IO-aware attention algorithm. 2-4x speedup with no approximation.
read paper →
Touvron et al., Meta
RMSNorm, SwiGLU, RoPE. strong performance at 7B-65B scale.
read paper →
Hu et al., Microsoft
parameter-efficient fine-tuning. train 0.1% of params with minimal quality loss.
read paper →
Ouyang et al., OpenAI
RLHF methodology. reward modeling + PPO for alignment.
read paper →
Rafailov et al., Stanford
train on preferences directly without reward model. simpler than RLHF.
read paper →
Bai et al., Anthropic
self-critique and principle-based feedback. scalable oversight.
read paper →
# train.py
import torch
from transformers import (
AutoModelForCausalLM,
AutoTokenizer,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)
from datasets import load_dataset
# load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    "base-model",
    torch_dtype=torch.bfloat16,
    use_cache=False,
)
model.gradient_checkpointing_enable()
tokenizer = AutoTokenizer.from_pretrained("base-model")
# The collator used below pads every batch, which fails when the tokenizer
# defines no padding token (common for causal-LM tokenizers). Fall back to
# the EOS token in that case.
# NOTE(review): with pad == EOS the collator may also mask genuine EOS
# positions in the labels - confirm this is acceptable for your data.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


# load and tokenize dataset
dataset = load_dataset("your-dataset")


def tokenize(examples):
    """Tokenize the ``"text"`` field of a batch, truncating to 2048 tokens."""
    return tokenizer(
        examples["text"],
        truncation=True,
        max_length=2048,
    )


tokenized = dataset.map(tokenize, batched=True)

# training arguments
args = TrainingArguments(
    output_dir="./model",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=16,
    learning_rate=3e-4,
    warmup_steps=2000,
    lr_scheduler_type="cosine",
    bf16=True,
    gradient_checkpointing=True,
    logging_steps=100,
    save_steps=1000,
)

# train
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized["train"],
    # mlm=False selects causal (next-token) language-modeling batches.
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
)
trainer.train()
trainer.save_model()
# instruction tuning with TRL
from trl import SFTTrainer
# format dataset
def format_instruction(example):
    """Render one record as an instruction/response prompt.

    Args:
        example: Mapping with ``"instruction"`` and ``"output"`` entries.

    Returns:
        A dict with a single ``"text"`` key holding the formatted prompt.
    """
    instruction = example["instruction"]
    response = example["output"]
    prompt = f"### Instruction:\n{instruction}\n\n### Response:\n{response}"
    return {"text": prompt}
# Add the rendered "text" prompt to every record.
dataset = dataset.map(format_instruction)

# Supervised fine-tuning on the "text" field. `model` and `args` are not
# defined in this snippet and must already exist when it runs.
trainer = SFTTrainer(
    args=args,
    model=model,
    train_dataset=dataset["train"],
    max_seq_length=2048,
    dataset_text_field="text",
)
trainer.train()
# launch with accelerate
accelerate config  # run once to configure
accelerate launch train.py

# or use torchrun
torchrun --nproc_per_node=8 train.py

# or DeepSpeed
deepspeed train.py --deepspeed ds_config.json
# start vLLM server
# Runs the vllm.entrypoints.openai.api_server module with tensor-parallel
# size 2, bfloat16 dtype and a maximum model length of 4096.
# "your-model" looks like a placeholder for the model to serve.
python -m vllm.entrypoints.openai.api_server \
--model your-model \
--tensor-parallel-size 2 \
--dtype bfloat16 \
--max-model-len 4096
# use OpenAI-compatible client
from openai import OpenAI
# Point the OpenAI client at the local server's /v1 endpoint.
client = OpenAI(
    api_key="token",
    base_url="http://localhost:8000/v1",
)

# Send a single-turn chat request.
response = client.chat.completions.create(
    messages=[{"role": "user", "content": "hello!"}],
    model="your-model",
)