from IPython.display import HTML

# Embed the Hub dataset viewer for the logged completions dataset
HTML('''
<iframe
  src="https://huggingface.co/datasets/davanstrien/test-logs/embed/viewer/default/train"
  frameborder="0"
  width="100%"
  height="560px">
</iframe>
''')
Daniel van Strien
February 20, 2025
During GRPO training, it can be useful to stare at your completions and try and understand how the different reward functions are behaving. This notebook shows an experimental branch of trl which pushes completions to a 🤗 Dataset repo.
You can also play with this in Colab
This results in a dataset that looks like this:
Once you have this data in a HF repo, you can go on and work with it using whatever tools you prefer.
To try this out, install from this fork of TRL.
NOTE: this is an experiment, so don’t expect everything to work super well!
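For reference, installing from a fork of TRL usually looks something like the sketch below; the <user> and <branch> placeholders are hypothetical, so substitute the actual fork and branch linked above.
# Hypothetical install command: replace <user> and <branch> with the real
# fork/branch of TRL that contains the completion-logging changes.
!pip install "git+https://github.com/<user>/trl.git@<branch>"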
This is basically the same as the example code in the GRPO docs in TRL. I just add some extra reward functions so we can see what the outputs for multiple rewards look like.
The main things we add are:
- log_completions=True
- log_completions_hub_repo='davanstrien/test-logs'
The first option enables logging of completions (this goes to WandB too), and the second pushes the completions to a 🤗 Dataset repo.
Note that at the moment we don’t overwrite the dataset if it already exists on the Hub.
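If you are rerunning training, one option is to check whether the repo already exists before starting and pick a fresh name; here is a minimal sketch (not from the original post) using huggingface_hub:
from huggingface_hub import HfApi

# Sketch: check for an existing completions dataset so a rerun doesn't clash with it.
api = HfApi()
log_repo = "davanstrien/test-logs"  # the repo used in this post
if api.repo_exists(log_repo, repo_type="dataset"):
    print(f"{log_repo} already exists on the Hub; consider a new, run-specific name.")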
from datasets import load_dataset
from trl import GRPOConfig, GRPOTrainer
dataset = load_dataset("trl-lib/tldr", split="train")
def reward_len(completions, **kwargs):
    return [-abs(20 - len(completion)) for completion in completions]
def reward_shouting(completions: list[str], **kwargs) -> list[float]:
    """Reward text completions where all alphabetic words are in uppercase letters."""
    results = []
    for completion in completions:
        words = completion.split()
        all_uppercase = True
        for word in words:
            # Extract only alphabetic characters
            alpha_only = ''.join(char for char in word if char.isalpha())
            # Skip empty strings or strings with no alphabetic characters
            if not alpha_only:
                continue
            # Check if the alphabetic part is uppercase
            if not alpha_only.isupper():
                all_uppercase = False
                break
        results.append(50.0 if all_uppercase else 0.0)
    return results
def reward_emojis(completions: list[str], **kwargs) -> list[float]:
    """Reward text completions that contain emojis, with extra points for 🤗."""
    results = []
    for completion in completions:
        # Base score - check if any emoji exists in the completion
        has_any_emoji = any(char for char in completion if ord(char) > 127000)
        base_score = 10.0 if has_any_emoji else 0.0
        # Bonus points for 🤗 (hugging face emoji)
        hugging_face_count = completion.count('🤗')
        bonus_score = hugging_face_count * 5.0
        # Total score
        total_score = base_score + bonus_score
        results.append(total_score)
    return results
training_args = GRPOConfig(
    output_dir="Qwen2-0.5B-GRPO",
    logging_steps=1,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    num_generations=2,
    log_completions=True,
    max_steps=300,
    log_completions_hub_repo='davanstrien/test-logs',  # repo to push completions to
)
trainer = GRPOTrainer(
    model="Qwen/Qwen2-0.5B-Instruct",
    reward_funcs=[reward_len, reward_shouting, reward_emojis],
    args=training_args,
    train_dataset=dataset,
)
trainer.train()
Once we’ve finished training, we can work with the completions dataset using Polars, Pandas, etc. This can help give us more insight into how the different reward functions are behaving and how we might modify our training process.
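The post doesn’t show the loading step explicitly, but something along these lines (a sketch, assuming the logged repo from above) pulls the completions back from the Hub into a Polars DataFrame named df, which the snippets below use:
import polars as pl
from datasets import load_dataset

# Load the logged completions back from the Hub and convert them to Polars.
completions = load_dataset("davanstrien/test-logs", split="train")
df = completions.to_polars()
df.head()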
We can already see that we need to do something different if we want more shouting, and that the emoji reward is triggered sometimes but not very often.
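One quick way to sanity-check those observations (a sketch, using the reward column names that appear in the plotting code below) is to look at the mean of each reward:
# Mean of each reward column gives a rough sense of how often each reward fires.
df.select([
    pl.col("reward_reward_len").mean(),
    pl.col("reward_reward_shouting").mean(),
    pl.col("reward_reward_emojis").mean(),
])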
We can also do things like plot the length reward over time.
import polars as pl
import altair as alt

df_sorted = df.sort("step")
df_with_avg = df_sorted.with_columns(
    pl.col("reward_reward_len")
    .rolling_mean(window_size=10, min_samples=1)
    .alias("rolling_avg")
)

# Now create separate charts and combine them
scatter_chart = (
    alt.Chart(df_with_avg)
    .mark_circle(opacity=0.6)
    .encode(
        x=alt.X("step:Q", title="Step"),
        y=alt.Y("reward_reward_len:Q", title="Length Reward Score"),
        tooltip=["step", "reward_reward_len"],
    )
)

line_chart = (
    alt.Chart(df_with_avg)
    .mark_line(color="purple", size=2)
    .encode(x="step:Q", y="rolling_avg:Q")
)
combined_chart = (
    (scatter_chart + line_chart)
    .properties(
        width=800,
        height=400,
        title="Length Rewards Over Time with 10-step Moving Average",
    )
    .configure_axis(labelFontSize=12, titleFontSize=14)
)
combined_chart
heatmap_data = df.with_columns([
    pl.col('step').cast(pl.Int32) // 10 * 10
]).group_by('step').agg([
    pl.mean('reward_reward_len').alias('avg_len_reward'),
    pl.mean('reward_reward_shouting').alias('avg_shouting_reward'),
    pl.mean('reward_reward_emojis').alias('avg_emoji_reward')
])

heatmap_long = heatmap_data.unpivot(
    index=['step'],
    on=['avg_len_reward', 'avg_shouting_reward', 'avg_emoji_reward'],
    variable_name='reward_type',
    value_name='value'
)

heatmap = alt.Chart(heatmap_long).mark_rect().encode(
    x='step:O',
    y='reward_type:N',
    color=alt.Color('value:Q', scale=alt.Scale(scheme='viridis'))
).properties(
    width=800,
    height=200,
    title='Reward Types Intensity Over Time'
)
heatmap
histogram = alt.Chart(df).transform_fold(
    ['reward', 'reward_reward_len', 'reward_reward_shouting', 'reward_reward_emojis'],
    as_=['Reward Type', 'Value']
).mark_bar(opacity=0.7).encode(
    alt.X('Value:Q', bin=alt.Bin(maxbins=30)),
    alt.Y('count():Q'),
    alt.Color('Reward Type:N'),
    alt.Row('Reward Type:N')
).properties(
    width=600,
    height=150,
    title='Distribution of Different Reward Types'
)
histogram
This was hacked together in an hour or so, but I’m curious if other people think this could be useful?
I think opening up completion logs more widely could allow for more analysis and insight into how training is progressing, even for those without the compute resources to run training themselves.