% pip install huggingface_hub backoff wordcloud tabulate toolz matplotlib
Requirement already satisfied: huggingface_hub in ./.venv/lib/python3.11/site-packages (0.15.1)
Requirement already satisfied: backoff in ./.venv/lib/python3.11/site-packages (2.2.1)
Requirement already satisfied: wordcloud in ./.venv/lib/python3.11/site-packages (1.9.2)
Requirement already satisfied: tabulate in ./.venv/lib/python3.11/site-packages (0.9.0)
Requirement already satisfied: filelock in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (3.12.2)
Requirement already satisfied: fsspec in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (2023.6.0)
Requirement already satisfied: requests in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (2.31.0)
Requirement already satisfied: tqdm>=4.42.1 in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (4.65.0)
Requirement already satisfied: pyyaml>=5.1 in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (6.0)
Requirement already satisfied: typing-extensions>=3.7.4.3 in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (4.7.1)
Requirement already satisfied: packaging>=20.9 in ./.venv/lib/python3.11/site-packages (from huggingface_hub) (23.1)
Requirement already satisfied: numpy>=1.6.1 in ./.venv/lib/python3.11/site-packages (from wordcloud) (1.25.0)
Requirement already satisfied: pillow in ./.venv/lib/python3.11/site-packages (from wordcloud) (10.0.0)
Requirement already satisfied: matplotlib in ./.venv/lib/python3.11/site-packages (from wordcloud) (3.7.1)
Requirement already satisfied: contourpy>=1.0.1 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (1.1.0)
Requirement already satisfied: cycler>=0.10 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (0.11.0)
Requirement already satisfied: fonttools>=4.22.0 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (4.40.0)
Requirement already satisfied: kiwisolver>=1.0.1 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (1.4.4)
Requirement already satisfied: pyparsing>=2.3.1 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (3.1.0)
Requirement already satisfied: python-dateutil>=2.7 in ./.venv/lib/python3.11/site-packages (from matplotlib->wordcloud) (2.8.2)
Requirement already satisfied: charset-normalizer<4,>=2 in ./.venv/lib/python3.11/site-packages (from requests->huggingface_hub) (3.1.0)
Requirement already satisfied: idna<4,>=2.5 in ./.venv/lib/python3.11/site-packages (from requests->huggingface_hub) (3.4)
Requirement already satisfied: urllib3<3,>=1.21.1 in ./.venv/lib/python3.11/site-packages (from requests->huggingface_hub) (2.0.3)
Requirement already satisfied: certifi>=2017.4.17 in ./.venv/lib/python3.11/site-packages (from requests->huggingface_hub) (2023.5.7)
Requirement already satisfied: six>=1.5 in ./.venv/lib/python3.11/site-packages (from python-dateutil>=2.7->matplotlib->wordcloud) (1.16.0)
Note: you may need to restart the kernel to use updated packages.
from huggingface_hub import list_datasets
from toolz import valmap, countby, groupby, topk, valmap
from wordcloud import WordCloud
import matplotlib.pyplot as plt
Load datasets
datasets = list (iter (list_datasets(full= True , sort= "downloads" , direction=- 1 )))
def get_lang(dataset):
card_data = dataset.cardData
if card_data:
lang = card_data.get("language" )
if lang is None :
return False
if len (lang) >= 1 :
return True
if not card_data:
return "No card data"
has_lang = groupby(get_lang, datasets)
dict_keys([True, 'No card data', False, None])
has_language_freqs = countby(get_lang, datasets)
has_language_percents = valmap(
lambda x: round (x / sum (has_language_freqs.values()) * 100 , ndigits= 2 ),
has_language_freqs,
)
plt.style.use("ggplot" )
data = {True : 13.31 , "No card data" : 42.03 , False : 44.5 }
# Convert the keys to strings
keys = [str (key) for key in data]
# Separate the values from the dictionary
values = list (data.values())
# Create a bar chart
plt.bar(keys, values)
# Set the labels for x and y axes
plt.ylabel("Percent" )
# Set the title of the chart
plt.title("Has language information?" )
# Display the chart
plt.show()
Top ten most frequent languages amounts
countby(count_langs, with_lang).items(), key= lambda x: x[1 ]
topk(10 , countby(count_langs, with_lang).items(), key= lambda x: x[1 ])
((1, 5107),
(2, 662),
(3, 76),
(4, 35),
(6, 28),
(11, 26),
(7, 23),
(5, 20),
(12, 19),
(10, 18))
highest number of languages
max (countby(count_langs, with_lang).keys())
Get languages
def get_langs(dataset):
return dataset.cardData.get("language" )
from toolz import concat, frequencies
lang_freqs = frequencies(concat(get_langs(d) for d in with_lang))
Number of unique languages specified on the hub
---------------------------------------------------------------------------
ModuleNotFoundError Traceback (most recent call last)
Cell In[19], line 1
----> 1 import pandas as pd
ModuleNotFoundError : No module named 'pandas'
df = pd.DataFrame({"Language" : lang_freqs.keys(), "Frequency" : lang_freqs.values()})
df
0
en
3949
1
ja
221
2
ace
19
3
acm
6
4
acq
5
...
...
...
1714
mr-
1
1715
xx
1
1716
nbl
2
1717
sep
1
1718
ssw
2
1719 rows × 2 columns
df.sort_values("Frequency" , ascending= False ).iloc[:20 ].set_index("Language" ).plot.bar()
df["Percent" ] = (df.Frequency / df.Frequency.sum () * 100 ).round (3 )
print (
df.sort_values("Frequency" , ascending= False )
.iloc[:20 ]
.set_index("Language" )
.to_markdown()
)
| Language | Frequency | Percent |
|:-----------|------------:|----------:|
| en | 3949 | 19.04 |
| fr | 394 | 1.9 |
| zh | 390 | 1.88 |
| es | 358 | 1.726 |
| de | 350 | 1.687 |
| ru | 333 | 1.606 |
| pt | 238 | 1.147 |
| it | 229 | 1.104 |
| ja | 221 | 1.066 |
| pl | 207 | 0.998 |
| ar | 195 | 0.94 |
| ko | 184 | 0.887 |
| nl | 182 | 0.877 |
| tr | 156 | 0.752 |
| vi | 155 | 0.747 |
| sv | 153 | 0.738 |
| id | 149 | 0.718 |
| hi | 148 | 0.714 |
| th | 145 | 0.699 |
| fi | 144 | 0.694 |
df = df.drop(columns= "Percent" )
df.sort_values("Frequency" , ascending= False ).iloc[:20 ].set_index("Language" ).plot.barh()
fig, ax = plt.subplots(figsize= (10 , 6 ))
df[df.Language != "en" ].sort_values("Frequency" , ascending= False ).iloc[:50 ].set_index(
"Language"
).plot.bar(ax= ax)
ax.set_xlabel("Language" )
ax.set_ylabel("Frequency" )
ax.set_title("Top 50 Languages (excluding English)" )
plt.show()
from matplotlib.colors import LinearSegmentedColormap
# Define Hugging Face brand colors
hugging_face_colors = ["#FFD21E" , "#FF9D00" , "#6B7280" ]
# Create custom colormap
color_map = LinearSegmentedColormap.from_list("hugging_face" , hugging_face_colors)
wordcloud = WordCloud(width= 800 , height= 400 , colormap= color_map)
wordcloud.generate_from_frequencies(lang_freqs)
plt.figure(figsize= (10 , 5 ))
plt.imshow(wordcloud, interpolation= "bilinear" )
plt.axis("off" )
plt.show()