from IPython.display import HTML

# Embed the Hub dataset viewer for the logged completions dataset
HTML('''
<iframe
  src="https://huggingface.co/datasets/davanstrien/test-logs/embed/viewer/default/train"
  frameborder="0"
  width="100%"
  height="560px">
</iframe>
''')
Daniel van Strien
February 20, 2025
During GRPO training, it can be useful to stare at your completions and try and understand how the different reward functions are behaving. This notebook shows an experimental branch of trl which pushes completions to a 🤗 Dataset repo.
You can also play with this in Colab
This results in a dataset that looks like this:
Once you have this data in a HF repo, you can go on and work with it using whatever tools you prefer.
To try this out, install from this fork of TRL.
NOTE: this is an experiment, so don’t expect everything to work super well!
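For reference, installing from a fork of TRL usually looks something like the sketch below; the <user> and <branch> placeholders are hypothetical, so substitute the actual fork and branch linked above.
# Hypothetical install command: replace <user> and <branch> with the real
# fork/branch of TRL that contains the completion-logging changes.
!pip install "git+https://github.com/<user>/trl.git@<branch>"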
This is basically the same as the example code in the GRPO docs in TRL. I just add some extra reward functions so we can see what the outputs for multiple rewards look like.
The main things we add are:
- log_completions=True
- log_completions_hub_repo='davanstrien/test-logs'
The first option enables logging of completions (this goes to WandB too), and the second pushes the completions to a 🤗 Dataset repo.
Note that at the moment we don’t overwrite the dataset if it already exists on the Hub.
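If you are rerunning training, one option is to check whether the repo already exists before starting and pick a fresh name; here is a minimal sketch (not from the original post) using huggingface_hub:
from huggingface_hub import HfApi

# Sketch: check for an existing completions dataset so a rerun doesn't clash with it.
api = HfApi()
log_repo = "davanstrien/test-logs"  # the repo used in this post
if api.repo_exists(log_repo, repo_type="dataset"):
    print(f"{log_repo} already exists on the Hub; consider a new, run-specific name.")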
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
dataset = load_dataset("trl-lib/tldr", split="train")
def reward_len(completions, **kwargs):
    return [-abs(20 - len(completion)) for completion in completions]
def reward_shouting(completions: list[str], **kwargs) -> list[float]:
    """Reward text completions where all alphabetic words are in uppercase letters."""
    results = []
    for completion in completions:
        words = completion.split()
        all_uppercase = True
        for word in words:
            # Extract only alphabetic characters
            alpha_only = ''.join(char for char in word if char.isalpha())
            # Skip empty strings or strings with no alphabetic characters
            if not alpha_only:
                continue
            # Check if the alphabetic part is uppercase
            if not alpha_only.isupper():
                all_uppercase = False
                break
        results.append(50.0 if all_uppercase else 0.0)
    return results
def reward_emojis(completions: list[str], **kwargs) -> list[float]:
    """Reward text completions that contain emojis, with extra points for 🤗."""
    results = []
    for completion in completions:
        # Base score - check if any emoji exists in the completion
        has_any_emoji = any(char for char in completion if ord(char) > 127000)
        base_score = 10.0 if has_any_emoji else 0.0
        # Bonus points for 🤗 (hugging face emoji)
        hugging_face_count = completion.count('🤗')
        bonus_score = hugging_face_count * 5.0
        # Total score
        total_score = base_score + bonus_score
        results.append(total_score)
    return results
training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",
    logging_steps=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_generations=2,
    log_completions=True,
    max_steps=300,
    log_completions_hub_repo='davanstrien/test-logs',  # repo to push completions to
)
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=[reward_len, reward_shouting, reward_emojis],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
Once we’ve finished training, we can work with the completions dataset using Polars, Pandas, etc. This can help give us more insight into how the different reward functions are behaving and how we might modify our training process.
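The post doesn’t show the loading step explicitly, but something along these lines (a sketch, assuming the logged repo from above) pulls the completions back from the Hub into a Polars DataFrame named df, which the snippets below use:
import polars as pl
from datasets import load_dataset

# Load the logged completions back from the Hub and convert them to Polars.
completions = load_dataset("davanstrien/test-logs", split="train")
df = completions.to_polars()
df.head()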
We can already see that we need to do something different if we want more shouting, and that the emoji reward is triggered sometimes but not very often.
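One quick way to sanity-check those observations (a sketch, using the reward column names that appear in the plotting code below) is to look at the mean of each reward:
# Mean of each reward column gives a rough sense of how often each reward fires.
df.select([
    pl.col("reward_reward_len").mean(),
    pl.col("reward_reward_shouting").mean(),
    pl.col("reward_reward_emojis").mean(),
])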
We can also do things like plot the length reward over time.
import polars as pl
import altair as alt

df_sorted = df.sort("step")
df_with_avg = df_sorted.with_columns(
    pl.col("reward_reward_len")
    .rolling_mean(window_size=10, min_samples=1)
    .alias("rolling_avg")
)

# Now create separate charts and combine them
scatter_chart = (
    alt.Chart(df_with_avg)
    .mark_circle(opacity=0.6)
    .encode(
        x=alt.X("step:Q", title="Step"),
        y=alt.Y("reward_reward_len:Q", title="Length Reward Score"),
        tooltip=["step", "reward_reward_len"],
    )
)

line_chart = (
    alt.Chart(df_with_avg)
    .mark_line(color="purple", size=2)
    .encode(x="step:Q", y="rolling_avg:Q")
)
combined_chart = (
    (scatter_chart + line_chart)
    .properties(
        width=800,
        height=400,
        title="Length Rewards Over Time with 10-step Moving Average",
    )
    .configure_axis(labelFontSize=12, titleFontSize=14)
)
combined_chart
heatmap_data = df.with_columns([
    pl.col('step').cast(pl.Int32) // 10 * 10
]).group_by('step').agg([
    pl.mean('reward_reward_len').alias('avg_len_reward'),
    pl.mean('reward_reward_shouting').alias('avg_shouting_reward'),
    pl.mean('reward_reward_emojis').alias('avg_emoji_reward')
])

heatmap_long = heatmap_data.unpivot(
    index=['step'],
    on=['avg_len_reward', 'avg_shouting_reward', 'avg_emoji_reward'],
    variable_name='reward_type',
    value_name='value'
)

heatmap = alt.Chart(heatmap_long).mark_rect().encode(
    x='step:O',
    y='reward_type:N',
    color=alt.Color('value:Q', scale=alt.Scale(scheme='viridis'))
).properties(
    width=800,
    height=200,
    title='Reward Types Intensity Over Time'
)
heatmap
histogram = alt.Chart(df).transform_fold(
    ['reward', 'reward_reward_len', 'reward_reward_shouting', 'reward_reward_emojis'],
    as_=['Reward Type', 'Value']
).mark_bar(opacity=0.7).encode(
    alt.X('Value:Q', bin=alt.Bin(maxbins=30)),
    alt.Y('count():Q'),
    alt.Color('Reward Type:N'),
    alt.Row('Reward Type:N')
).properties(
    width=600,
    height=150,
    title='Distribution of Different Reward Types'
)
histogram
This was hacked together in an hour or so, but I’m curious if other people think this could be useful?
I think opening up completion logs more widely could allow for more analysis and insight into how training is progressing, even for those without the compute resources to run training themselves.