Tutorial: How to Train gpt-oss with RL
Learn how to train OpenAI gpt-oss with GRPO to autonomously beat the 2048 game, locally or on Colab.
1. Install Unsloth
```python
!pip install --upgrade -qqq uv
try: import numpy; get_numpy = f"numpy=={numpy.__version__}"
except: get_numpy = "numpy"
!uv pip install -qqq \
    "torch>=2.8.0" "triton>=3.4.0" {get_numpy} torchvision bitsandbytes "transformers==4.56.2" \
    "unsloth_zoo[base] @ git+https://github.com/unslothai/unsloth-zoo" \
    "unsloth[base] @ git+https://github.com/unslothai/unsloth" \
    git+https://github.com/triton-lang/triton.git@05b2c186c1b6c9a08375389d5efe9cb4c401c075#subdirectory=python/triton_kernels
!uv pip install --upgrade --no-deps transformers==4.56.2 tokenizers
!uv pip install --no-deps trl==0.22.2
```
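Optionally, a quick sanity check that the pinned versions were picked up by the notebook kernel (restart the runtime first if an older version is still cached):

```python
import torch, transformers, trl

print("torch:", torch.__version__)                 # the install pins require >= 2.8.0
print("transformers:", transformers.__version__)   # expect 4.56.2
print("trl:", trl.__version__)
```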
2. Load gpt-oss with Unsloth
```python
from unsloth import FastLanguageModel
import torch

max_seq_length = 768  # Increase if your task needs longer outputs
lora_rank = 4         # Higher rank -> better quality, but more VRAM/compute

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/gpt-oss-20b",  # Or unsloth/gpt-oss-20b-BF16 on an H100
    max_seq_length = max_seq_length,
    load_in_4bit = True,        # Set to False for 16-bit
    offload_embedding = True,   # Saves about 1 GB of VRAM
)
```
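Optional: a quick check with plain `torch.cuda` utilities (nothing Unsloth-specific) to see how much VRAM the 4-bit load actually reserved:

```python
import torch

gpu = torch.cuda.get_device_properties(0)
reserved_gb = torch.cuda.max_memory_reserved(0) / 1024**3
print(f"{gpu.name}: {reserved_gb:.1f} GB reserved of {gpu.total_memory / 1024**3:.1f} GB total")
```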
```python
model = FastLanguageModel.get_peft_model(
    model,
    r = lora_rank,
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    lora_alpha = lora_rank * 2,
    use_gradient_checkpointing = "unsloth",  # Large memory savings
    random_state = 3407,
)
```
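Optionally, verify that only the LoRA adapters are marked trainable; `get_peft_model` returns a PEFT-wrapped model, so the standard PEFT helper should be available:

```python
# Prints the number and percentage of trainable parameters (just the LoRA adapters)
model.print_trainable_parameters()
```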
4. Safe code execution & anti-cheating checks
```python
from unsloth import check_python_modules

ok, info = check_python_modules("""
def strategy(board):
    import math
    from typing import Callable
    return "W"
""")
# ok == True means only Python-level imports were used

sample = """
def strategy(board):
    from numpy import matmul
    return "W"
"""
ok, info = check_python_modules(sample)
# ok => False

from unsloth import create_locked_down_function

function = """
def add(a, b):
    def adder(a):
        return a + b
    return adder(b) + b
"""
f = create_locked_down_function(function)
# Errors out if globals / imports are used

from unsloth import execute_with_time_limit

@execute_with_time_limit(2)
def execute_strategy(strategy, game):
    # Loop until the game ends or the time limit is hit
    ...
```
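The loop body of `execute_strategy` is elided above. A rough sketch of what it could look like; note that `is_finished()`, `board_list()`, `do_action()`, and `state()` are placeholder names for whatever the 2048 `GameBoard` engine in your notebook actually exposes:

```python
from unsloth import execute_with_time_limit

@execute_with_time_limit(2)  # raises TimeoutError after 2 seconds
def execute_strategy(strategy, game):
    # Play the model's strategy until the game ends or the time limit fires.
    # GameBoard method names below are hypothetical, for illustration only.
    steps = 0
    while not game.is_finished():
        action = strategy(game.board_list())  # strategy returns "W", "A", "S" or "D"
        game.do_action(action)
        steps += 1
    return steps, game.state()  # e.g. "success" once a 2048 tile is reached
```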
5. The prompt and dataset

The prompt given to the model:

Create a new, short 2048 strategy using only native Python code.
You are given a list of lists of numbers describing the current board state.
Output the single best action for the next move, as one of "W", "A", "S", "D".
Format your new short function in backticks as below:
```python
def strategy(board):
    return "W"  # example
```
Create a small synthetic dataset (reusing the same prompt) and compute the prompt length so GRPO knows how many completion tokens it can sample:
```python
from datasets import Dataset

prompt = ...  # the prompt shown above

# Length of the prompt once the chat template is applied
maximum_length = len(tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}], add_generation_prompt=True
))

# Repeat the same prompt 1,000 times as a synthetic dataset
dataset = Dataset.from_list([
    {"prompt": [{"role": "user", "content": prompt}], "answer": 0, "reasoning_effort": "low"}
] * 1000)
```

Now for the reward functions!
```python
def extract_function(text):
    # Pull the fenced Python block out of the model's response
    if text.count("```") >= 2:
        first = text.find("```") + 3
        second = text.find("```", first)
        fx = text[first:second].strip()
        fx = fx.removeprefix("python\n")
        fx = fx[fx.find("def"):]
        if fx.startswith("def strategy(board):"):
            return fx
    return None

from unsloth import create_locked_down_function, check_python_modules

# Reward 1: the extracted function compiles inside the locked-down sandbox
def function_works(completions, **kwargs):
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        function = extract_function(response)
        if function is None:
            scores.append(-2.0)
            continue
        ok, info = check_python_modules(function)
        if "error" in info:
            scores.append(-2.0)
            continue
        try:
            _ = create_locked_down_function(function)
            scores.append(1.0)
        except Exception:
            scores.append(-0.5)
    return scores

# Reward 2: only plain-Python imports are allowed
def no_cheating(completions, **kwargs):
    scores = []
    for completion in completions:
        response = completion[0]["content"]
        function = extract_function(response)
        if function is None:
            scores.append(-1.0)
            continue
        ok, _ = check_python_modules(function)
        scores.append(1.0 if ok else -20.0)  # Heavy penalty for cheating
    return scores

import numpy as np

PRINTER = 0  # Print occasionally for debugging

# Reward 3: the strategy actually plays the game and reaches 2048
def strategy_succeeds(completions, **kwargs):
    global PRINTER
    scores = []
    seed = np.random.randint(10000)
    for completion in completions:
        response = completion[0]["content"]
        function = extract_function(response)
        if function is None:
            scores.append(-2.0)
            continue
        try:
            new_strategy = create_locked_down_function(function)
        except Exception:
            scores.append(0.0)
            continue
        try:
            game = GameBoard(size=6, seed=seed, target=2048, probability_fours=0.10)
            steps, state = execute_strategy(new_strategy, game)
            if PRINTER % 5 == 0:
                print(function)
                print(f"Steps={steps} State={state}")
                print(game.board().pretty())
            PRINTER += 1
            if state == "success":
                scores.append(20.0)
            else:
                scores.append(2.0)  # Runs, but did not reach 2048
        except TimeoutError:
            scores.append(-1.0)  # Timed out
        except Exception:
            scores.append(-3.0)  # Crashed
    return scores
```
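Before launching a long run, it can be worth smoke-testing the reward functions on a hand-written completion in the same shape GRPOTrainer passes them (a list of conversations, each a list of messages). The example completion below is made up purely for this check:

```python
fake_completions = [[{
    "role": "assistant",
    "content": '```python\ndef strategy(board):\n    return "W"\n```',
}]]

print(function_works(fake_completions))  # should be [1.0]: parses, compiles, no bad imports
print(no_cheating(fake_completions))     # should be [1.0]: nothing outside plain Python
# strategy_succeeds(...) also needs the GameBoard class from the 2048 game section,
# so only call it once that is defined.
```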
Configure GRPO
```python
from trl import GRPOConfig, GRPOTrainer

max_prompt_length = maximum_length + 1
max_completion_length = max_seq_length - max_prompt_length

training_args = GRPOConfig(
    temperature=1.0,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.1,
    lr_scheduler_type="linear",
    optim="adamw_8bit",
    logging_steps=1,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,  # Increase to 4 for a smoother reward signal
    num_generations=2,              # Lower this if you run out of memory (OOM)
    max_prompt_length=max_prompt_length,
    max_completion_length=max_completion_length,
    max_steps=1000,                 # Or set num_train_epochs=1
    save_steps=100,
    report_to="none",
    output_dir="outputs",
)
```
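Why `num_generations` matters: GRPO scores each completion relative to the other completions sampled for the same prompt, so the reward signal only exists within a group. A toy illustration of that group-relative advantage (not TRL's exact implementation):

```python
import numpy as np

# Toy example: 2 generations for one prompt, summed rewards from the three reward functions
rewards = np.array([23.0, -2.5])  # e.g. one strategy reached 2048, the other crashed
advantages = (rewards - rewards.mean()) / (rewards.std() + 1e-4)
print(advantages)  # the winning completion gets a positive advantage, the other a negative one
```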
```python
trainer = GRPOTrainer(
    model=model,
    processing_class=tokenizer,
    reward_funcs=[function_works, no_cheating, strategy_succeeds],
    args=training_args,
    train_dataset=dataset,
    # Optional eval split:
    # train_dataset=new_dataset["train"],
    # eval_dataset=new_dataset["test"],
)
```

Train your model
```python
trainer.train()
```
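Since `save_steps=100` writes periodic checkpoints into `outputs/`, an interrupted run can usually be resumed rather than restarted, via the standard Hugging Face Trainer mechanism that GRPOTrainer inherits:

```python
# Resume from the most recent checkpoint under output_dir ("outputs")
trainer.train(resume_from_checkpoint=True)
```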
Inference (after training)

```python
from transformers import TextStreamer

text = tokenizer.apply_chat_template(
    [{"role": "user", "content": prompt}],
    tokenize=False,
    add_generation_prompt=True,
    reasoning_effort="low",
)
_ = model.generate(
    **tokenizer(text, return_tensors="pt").to("cuda"),
    temperature=1.0,
    max_new_tokens=1024,
    streamer=TextStreamer(tokenizer, skip_prompt=False),
)
```
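To actually play a game with whatever the trained model wrote, you can reuse the helpers defined earlier. A rough sketch; it re-generates without the streamer so the text can be captured, and `GameBoard` is again the 2048 engine from the game section of the notebook:

```python
# Generate once more, capturing the output instead of streaming it
inputs = tokenizer(text, return_tensors="pt").to("cuda")
output_ids = model.generate(**inputs, temperature=1.0, max_new_tokens=1024)
response = tokenizer.decode(output_ids[0, inputs["input_ids"].shape[1]:], skip_special_tokens=True)

# Reuse the tutorial's helpers to validate and execute the generated strategy
function = extract_function(response)
if function is not None and check_python_modules(function)[0]:
    strategy = create_locked_down_function(function)
    game = GameBoard(size=6, seed=0, target=2048, probability_fours=0.10)
    steps, state = execute_strategy(strategy, game)
    print(f"Steps={steps} State={state}")
```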
Save / export your fine-tuned model

```python
model.save_pretrained_merged("finetuned_model", tokenizer, save_method="merged_16bit")

# Or push to the Hugging Face Hub:
model.push_to_hub_merged("<org_or_user>/<repo>", tokenizer, token="<hf_token>", save_method="merged_16bit")
```
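The merged folder is a regular Transformers checkpoint, so it can be reloaded without Unsloth if needed. A quick sketch, assuming the `finetuned_model` path used above (note the merged 16-bit 20B model needs substantial memory to load):

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

tok = AutoTokenizer.from_pretrained("finetuned_model")
merged = AutoModelForCausalLM.from_pretrained(
    "finetuned_model", torch_dtype="auto", device_map="auto",
)
print(merged.config.model_type)  # should report the gpt-oss architecture
```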