How to do a groupby for Hugging Face datasets
How can you do a groupby on Hugging Face datasets?
%pip install datasets polars
from datasets import load_dataset
import polars as pl
# Load the curated Dolly-15k dataset from the Hugging Face Hub.
ds = load_dataset("argilla/databricks-dolly-15k-curated-en")
# Peek at the first training example (displayed when run as a notebook cell).
ds['train'][0]
# Wrap the Arrow table backing the train split in a Polars DataFrame.
df = pl.from_arrow(ds['train'].data.table)
# Mean length of 'original-instruction' per category.
# NOTE: `group_by` / `str.len_bytes` are the current names of the deprecated
# `groupby` / `str.lengths`, which recent Polars releases no longer provide.
# `len_bytes` keeps the original semantics (length in UTF-8 bytes); switch to
# `str.len_chars()` if a character count is wanted instead.
grouped = df.group_by('category').agg(
    pl.col('original-instruction').str.len_bytes().mean().alias('length')
)
grouped
# Order categories from shortest to longest mean instruction length.
grouped = grouped.sort('length')
import matplotlib.pyplot as plt
import matplotlib
# Bar chart of the mean instruction length per category, drawn with the
# object-oriented Figure/Axes API.
plt.style.use('fivethirtyeight')

fig, ax = plt.subplots(figsize=(10, 6))
ax.bar(grouped['category'], grouped['length'])

# Slant the category labels so long names do not overlap.
plt.xticks(rotation=45, ha='right')
ax.set_ylabel('Mean Length')
ax.set_title('original-instruction length for each category')

fig.tight_layout()
# Save before show(): interactive backends may clear the figure once shown.
fig.savefig('groupby-fig.png')
plt.show()