Transform Your 20B Monster into a Pocket Rocket 🚀
You've got OpenAI's gpt-oss-20b model - a reasoning beast that needs taming. Let's make it deployable!
Convert float16 weights to int4/int8. Get 4x size reduction in minutes!
Original Size: 15 GB
Compressed Size: 3.75 GB
Quality Loss: ~2%
# Quick Quantization with llama.cpp
# NOTE(review): convert.py and ./quantize are run from a llama.cpp checkout;
# the pip package below only provides the Python bindings. Confirm the tool
# names against the llama.cpp version in use.
pip install llama-cpp-python

# Convert to GGUF format (16-bit intermediate file)
python convert.py openai/gpt-oss-20b \
    --outfile gpt-oss-20b.gguf \
    --outtype f16

# Quantize to 4-bit
./quantize gpt-oss-20b.gguf \
    gpt-oss-20b-q4_k_m.gguf q4_k_m

# Test with Ollama.
# `ollama create` has no -m flag: the GGUF to load is named by the FROM line
# of the Modelfile passed with -f, so write that Modelfile first.
echo "FROM ./gpt-oss-20b-q4_k_m.gguf" > Modelfile
ollama create smol-gpt-oss -f ./Modelfile
ollama run smol-gpt-oss
Train a smaller model to mimic the large one. Takes days but preserves quality!
# Full Distillation Pipeline
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from datasets import load_dataset
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
# 1. Load Teacher Model (gpt-oss-20b)
teacher = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# 2. Create Student Architecture (a smaller dense Llama-style decoder)
# NOTE(review): the original labelled this "3B params" and quoted teacher
# dimensions (4096 hidden / 40 layers / 32 heads / 11008 intermediate) that
# were not read from the checkpoint. Compare against teacher.config before
# relying on any size ratio.
from transformers import LlamaConfig

student_config = LlamaConfig(
    hidden_size=2048,
    num_hidden_layers=24,
    num_attention_heads=16,
    intermediate_size=5504,
    # The distillation loss compares student and teacher logits over the
    # vocabulary axis, so both models must share the same vocabulary size.
    vocab_size=teacher.config.vocab_size,
)
# Randomly initialised student built from the config (no pretrained weights).
student = AutoModelForCausalLM.from_config(student_config)
# 3. Prepare Dataset
# ultrachat_200k has no plain "train" split; its supervised fine-tuning
# conversations are published as "train_sft". Take the first 10,000 examples.
dataset = load_dataset("HuggingFaceH4/ultrachat_200k", split="train_sft[:10000]")

# 4. Distillation Training Config
training_args = SFTConfig(
    output_dir="./smol-gpt-oss",
    num_train_epochs=3,
    # 4 sequences per device x 8 accumulation steps = 32 sequences per
    # optimizer step on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=8,
    learning_rate=2e-4,
    warmup_ratio=0.03,
    logging_steps=10,
    save_strategy="epoch",
    fp16=True,
    gradient_checkpointing=True,
    max_length=2048,
    report_to="wandb",
)
# 5. Custom Distillation Loss
import torch.nn.functional as F


class DistillationTrainer(SFTTrainer):
    """SFTTrainer whose loss is the temperature-scaled KL divergence to a teacher.

    Args:
        teacher: Model whose logits the student is trained to match. It is
            only called under ``torch.no_grad()``.
        temperature: Softmax temperature applied to both sets of logits.
            Defaults to 3.0, the value the original loss hard-coded.
        *args, **kwargs: Forwarded unchanged to ``SFTTrainer.__init__``.

    Raises:
        ValueError: If no teacher is supplied.
    """

    def __init__(self, *args, teacher=None, temperature=3.0, **kwargs):
        # SFTTrainer does not know about ``teacher``; strip it off here so it
        # is not forwarded to the base constructor.
        super().__init__(*args, **kwargs)
        if teacher is None:
            raise ValueError("DistillationTrainer requires a `teacher` model")
        self.teacher = teacher
        self.temperature = temperature

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Return the distillation loss for one batch.

        Args:
            model: The student model being trained.
            inputs: Keyword arguments passed to both student and teacher.
            return_outputs: When true, also return the student outputs.
            num_items_in_batch: Accepted for compatibility with Trainer
                versions that pass it; not used by this loss.

        Returns:
            The scalar loss, or ``(loss, student_outputs)`` when
            ``return_outputs`` is true.
        """
        # Student forward pass
        student_outputs = model(**inputs)
        student_logits = student_outputs.logits

        # Teacher forward pass (no grad)
        with torch.no_grad():
            teacher_outputs = self.teacher(**inputs)
        # The teacher may live on another device or use another dtype; align
        # its logits with the student's before comparing them.
        teacher_logits = teacher_outputs.logits.to(student_logits)

        # KL divergence between the temperature-softened distributions.
        # NOTE(review): 'batchmean' divides the summed divergence by the size
        # of the first dimension only - confirm that is the intended scaling
        # for (batch, sequence, vocab) logits.
        temp = self.temperature
        loss = F.kl_div(
            F.log_softmax(student_logits / temp, dim=-1),
            F.softmax(teacher_logits / temp, dim=-1),
            reduction="batchmean",
        ) * (temp ** 2)  # temp^2 keeps gradient magnitude comparable across temperatures

        return (loss, student_outputs) if return_outputs else loss
# 6. Train!
# The original passed an undefined `tokenizer`. Load the teacher's tokenizer
# so that student and teacher receive identical token ids.
tokenizer = AutoTokenizer.from_pretrained("openai/gpt-oss-20b")

trainer = DistillationTrainer(
    model=student,
    teacher=teacher,
    args=training_args,
    train_dataset=dataset,
    # `processing_class` is the current name of the former `tokenizer` kwarg.
    processing_class=tokenizer,
)
trainer.train()

# 7. Save & Quantize Further
student.save_pretrained("smol-gpt-oss-3b")
# Keep the tokenizer next to the weights so the directory is self-contained.
tokenizer.save_pretrained("smol-gpt-oss-3b")
# Then quantize with llama.cpp for extra compression!
Remove unnecessary weights. Risky but can work with fine-tuning!
# Structured Pruning with SparseML
# Shell prerequisite: pip install sparseml transformers
from sparseml.transformers import SparseAutoModelForCausalLM
from sparseml.transformers.sparsification import create_pruning_recipe

# Create pruning recipe (50% sparsity, then quantization from epoch 10)
# NOTE(review): the LoRA example elsewhere in this file addresses the expert
# weights as "<layer>.mlp.experts.gate_up_proj" (no per-expert index and no
# ".weight" suffix) - confirm that the glob patterns below match real
# parameter names in this checkpoint.
recipe = """
version: 1.0.0
modifiers:
  - !GMPruningModifier
    start_epoch: 0
    end_epoch: 10
    init_sparsity: 0.0
    final_sparsity: 0.5
    update_frequency: 100
    params:
      - "model.layers.*.mlp.experts.*.gate_up_proj.weight"
      - "model.layers.*.mlp.experts.*.down_proj.weight"
  - !QuantizationModifier
    start_epoch: 10
    scheme:
      input_activations:
        num_bits: 8
        symmetric: true
      weights:
        num_bits: 4
        symmetric: false
"""

# Save recipe. This must happen before the model is loaded, because the
# loader below is given the recipe's file path.
with open("pruning_recipe.yaml", "w", encoding="utf-8") as f:
    f.write(recipe)

# Load model with pruning support
model = SparseAutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    recipe="pruning_recipe.yaml",
)

# Apply pruning during fine-tuning
# NOTE(review): this snippet never builds a trainer around `model`; the only
# `trainer` defined earlier in this file trains the distillation student.
# Construct a trainer for `model` before this call.
trainer.train()

# Export optimized model
model.save_pretrained("gpt-oss-pruned")
Based on the HuggingFace cookbook - make your model reason in multiple languages!
# Multilingual Reasoning Fine-tune (from the cookbook)
from transformers import AutoModelForCausalLM, AutoTokenizer
from peft import LoraConfig, get_peft_model
from trl import SFTTrainer, SFTConfig
from datasets import load_dataset
# Load the model for fine-tuning. `Mxfp4Config(dequantize=True)` asks the
# loader to dequantize the MXFP4 checkpoint; weights are requested in bfloat16.
# These two names are used below but were never imported by this snippet.
import torch
from transformers import Mxfp4Config

model = AutoModelForCausalLM.from_pretrained(
    "openai/gpt-oss-20b",
    torch_dtype=torch.bfloat16,
    quantization_config=Mxfp4Config(dequantize=True),
    device_map="auto",
)
# LoRA adapter settings for the MoE model: rank 8, alpha 16, adapters on every
# linear module plus the expert projection parameters of layers 7, 15 and 23.
_lora_expert_params = [
    "7.mlp.experts.gate_up_proj",
    "7.mlp.experts.down_proj",
    "15.mlp.experts.gate_up_proj",
    "15.mlp.experts.down_proj",
    "23.mlp.experts.gate_up_proj",
    "23.mlp.experts.down_proj",
]
peft_config = LoraConfig(
    target_parameters=_lora_expert_params,
    target_modules="all-linear",
    lora_alpha=16,
    r=8,
)
# Multilingual reasoning data (full train split)
dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")

# Fine-tune the LoRA-wrapped model for one epoch and push the result to the Hub
_sft_args = SFTConfig(
    output_dir="gpt-oss-multilingual",
    num_train_epochs=1,
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    max_length=2048,
    push_to_hub=True,
)
_lora_model = get_peft_model(model, peft_config)
trainer = SFTTrainer(
    model=_lora_model,
    train_dataset=dataset,
    args=_sft_args,
)
trainer.train()
# Test multilingual reasoning
# The system turn selects the reasoning language; the user asks in Spanish.
# (The user string was mis-encoded in the original and is restored here.)
messages = [
    {"role": "system", "content": "reasoning language: German"},
    {"role": "user", "content": "¿Cuál es el capital de Australia?"},
]
# Model reasons in German, responds in Spanish!
Built for Lalo's SMOL Model Adventures
Remember: Start with quantization, graduate to distillation