import pandas as pd
import vizro.models as vm
import vizro.plotly.express as px
from vizro import Vizro
from vizro.models.types import capture
# ===============================
# Load main downloads dataset
# ===============================
# Download counts per project and date; once the headers are trimmed and
# lower-cased the columns used below are project, date and count.
url = "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/main/data/02_intermediate/pypi_kedro_downloads.csv"
df = pd.read_csv(url).rename(columns=lambda c: c.strip().lower())
df["date"] = pd.to_datetime(df["date"], format="%Y-%m-%d")

# Sum the counts per project and calendar month.
month_period = df["date"].dt.to_period("M")
df_monthly = df.groupby([df["project"], month_period])["count"].sum().reset_index()

# Convert the monthly periods back to timestamps and derive the helper
# columns used by the year-over-year charts: calendar year and the first
# three letters of the month name.
df_monthly["date"] = df_monthly["date"].dt.to_timestamp()
df_monthly["year"] = df_monthly["date"].dt.year
df_monthly["month"] = df_monthly["date"].dt.month_name().str[:3]
# ===============================
# Page 1: Regular monthly trend + Yearly summaries
# ===============================
# Total downloads per project and calendar year.
download_year = df["date"].dt.year
df_yearly = df.groupby([df["project"], download_year])["count"].sum().reset_index()
# The year key is derived from the "date" column and is exposed as "year";
# the summed "count" column is exposed as "downloads".
df_yearly = df_yearly.rename(columns={"date": "year", "count": "downloads"})
# Monthly download trend: one line (with markers) per project.
monthly_fig = px.line(
    data_frame=df_monthly,
    x="date",
    y="count",
    color="project",
    markers=True,
    title="Monthly PyPI Downloads (Kedro vs Kedro-Viz)",
    labels=dict(date="Month", count="Downloads", project="Project"),
)
monthly_chart = vm.Graph(id="monthly_chart", figure=monthly_fig)
# Express yearly downloads in millions, rounded to one decimal place.
df_yearly["downloads_mln"] = (df_yearly["downloads"] / 1_000_000).round(1)


@capture("graph")
def yearly_downloads_bar(data_frame, title):
    """Build a yearly-downloads bar chart with the value printed above each bar.

    This is a Vizro custom chart on purpose: Vizro re-creates a graph from the
    captured function call, so an ``update_traces`` chained onto a
    ``vizro.plotly.express`` figure outside of that call is not re-applied
    when the dashboard renders. Doing the update inside the captured function
    keeps the text formatting.

    Args:
        data_frame: Rows for a single project; must contain the ``year`` and
            ``downloads_mln`` columns.
        title: Chart title.

    Returns:
        The Plotly bar figure with one-decimal labels placed outside the bars.
    """
    fig = px.bar(
        data_frame,
        x="year",
        y="downloads_mln",
        text="downloads_mln",
        title=title,
        labels={
            "year": "Year",
            "downloads_mln": "Downloads (M)",
        },
    )
    fig.update_traces(texttemplate="%{text:.1f}", textposition="outside")
    return fig


# Bar chart: yearly totals for Kedro
kedro_yearly = df_yearly[df_yearly["project"] == "kedro"]
kedro_bar = vm.Graph(
    id="kedro_yearly",
    figure=yearly_downloads_bar(kedro_yearly, title="Kedro Yearly Downloads"),
)

# Bar chart: yearly totals for Kedro-Viz
viz_yearly = df_yearly[df_yearly["project"] == "kedro-viz"]
viz_bar = vm.Graph(
    id="viz_yearly",
    figure=yearly_downloads_bar(viz_yearly, title="Kedro-Viz Yearly Downloads"),
)
# Two-column, two-row grid: the monthly trend (component 0) fills the left
# column across both rows; the Kedro (1) and Kedro-Viz (2) yearly bars are
# stacked in the right column.
page1_grid = [
    [0, 1],
    [0, 2],
]
page1 = vm.Page(
    title="Monthly & Yearly Downloads",
    components=[monthly_chart, kedro_bar, viz_bar],
    layout=vm.Grid(grid=page1_grid),
)
# ===============================
# Page 2: YoY split charts
# ===============================
# Calendar order for the three-letter month labels stored in df_monthly["month"].
# Without an explicit order Plotly arranges a categorical axis by first
# appearance in the data, so a series that does not start in January would
# show the months out of calendar order.
MONTH_ORDER = [
    "Jan", "Feb", "Mar", "Apr", "May", "Jun",
    "Jul", "Aug", "Sep", "Oct", "Nov", "Dec",
]


def _yoy_line(data_frame, title):
    """Return a year-over-year line chart: one line per year across Jan-Dec.

    Args:
        data_frame: Monthly rows for a single project with ``month``,
            ``count`` and ``year`` columns.
        title: Chart title.

    Returns:
        The figure produced by ``px.line`` with the month axis pinned to
        calendar order.
    """
    return px.line(
        data_frame,
        x="month",
        y="count",
        color="year",
        markers=True,
        title=title,
        labels={
            "month": "Month",
            "count": "Downloads",
            "year": "Year",
        },
        category_orders={"month": MONTH_ORDER},
    )


# Kedro only
kedro_yoy = df_monthly[df_monthly["project"] == "kedro"]
kedro_chart = vm.Graph(
    id="kedro_yoy_chart",
    figure=_yoy_line(kedro_yoy, title="Kedro: Year-over-Year Monthly Downloads"),
)

# Kedro-Viz only
viz_yoy = df_monthly[df_monthly["project"] == "kedro-viz"]
viz_chart = vm.Graph(
    id="viz_yoy_chart",
    figure=_yoy_line(viz_yoy, title="Kedro-Viz: Year-over-Year Monthly Downloads"),
)
# Single-row grid: Kedro chart (component 0) on the left, Kedro-Viz chart
# (component 1) on the right.
page2_grid = [[0, 1]]
page2 = vm.Page(
    title="YoY Comparison",
    components=[kedro_chart, viz_chart],
    layout=vm.Grid(grid=page2_grid),
)
# ===============================
# Page 3: New Monthly Users + MAU
# ===============================
# New monthly Kedro users (split by version)
url_new_kedro_users_monthly = (
    "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/"
    "refs/heads/main/data/02_intermediate/new_kedro_users_monthly.csv"
)
df_new_users = pd.read_csv(url_new_kedro_users_monthly)
# Normalise the headers: trim, lower-case, then drop every run of non-word
# characters (hidden spaces, BOM, punctuation). Underscores count as word
# characters and are kept.
df_new_users.columns = (
    df_new_users.columns.str.strip()
    .str.lower()
    .str.replace(r"[^\w]+", "", regex=True)
)
# Print the cleaned column names for verification
print("Columns:", df_new_users.columns.tolist())
# Parse the year-month strings into timestamps
df_new_users["first_year_month"] = pd.to_datetime(
    df_new_users["first_year_month"], format="%Y-%m"
)
# Standardise the version column name: a header that was cleaned down to
# "maxversionprefix" is renamed. rename() leaves the frame unchanged when no
# such column is present, so no existence check is needed.
df_new_users = df_new_users.rename(columns={"maxversionprefix": "max_version_prefix"})

# Stacked bars: new users per month, coloured by version prefix.
new_users_fig = px.bar(
    data_frame=df_new_users,
    x="first_year_month",
    y="count",
    color="max_version_prefix",
    barmode="stack",
    title="New Monthly Kedro Users by Version",
    labels=dict(
        first_year_month="Month",
        count="New Users",
        max_version_prefix="Kedro Version",
    ),
)
new_users_chart = vm.Graph(id="new_users_chart", figure=new_users_fig)
# Kedro monthly active users (MAU), split by version prefix.
url_mau_kedro = (
    "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/"
    "refs/heads/main/data/02_intermediate/mau_kedro.csv"
)
# Read the CSV and trim/lower-case the headers in one step.
df_mau = pd.read_csv(url_mau_kedro).rename(columns=lambda c: c.strip().lower())
df_mau["year_month"] = pd.to_datetime(df_mau["year_month"], format="%Y-%m")

# Stacked bars: active users per month, coloured by version prefix.
mau_fig = px.bar(
    data_frame=df_mau,
    x="year_month",
    y="mau",
    color="max_version_prefix",
    barmode="stack",
    title="Kedro Monthly Active Users by Version",
    labels=dict(
        year_month="Month",
        mau="Monthly Active Users",
        max_version_prefix="Kedro Version",
    ),
)
mau_chart = vm.Graph(id="mau_chart", figure=mau_fig)
# Single-row grid: new-users chart (component 0) next to the MAU chart
# (component 1).
page3_grid = [[0, 1]]
page3 = vm.Page(
    title="Kedro Telemetry Users",
    components=[new_users_chart, mau_chart],
    layout=vm.Grid(grid=page3_grid),
)
# ===============================
# Page 4: Kedro Commands & Plugins MAU
# ===============================
# --- Commands ---
url_kedro_commands_mau = "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/refs/heads/main/data/02_intermediate/kedro_commands_mau.csv"
# Read the CSV and trim/lower-case the headers in one step.
df_cmds = pd.read_csv(url_kedro_commands_mau).rename(columns=lambda c: c.strip().lower())
# Add a parsed "Year-Month" timestamp column (the raw year_month column is
# kept alongside it), then switch the plotted columns to display names.
df_cmds = df_cmds.assign(
    **{"Year-Month": pd.to_datetime(df_cmds["year_month"], format="%Y-%m")}
).rename(columns={"first_two_words": "Command", "unique_users": "Unique Users"})

# One line (with markers) per command.
commands_fig = px.line(
    data_frame=df_cmds,
    x="Year-Month",
    y="Unique Users",
    color="Command",
    markers=True,
    title="Kedro Commands Usage (MAU by Command)",
    labels={
        "Year-Month": "Month",
        "Unique Users": "Unique Users",
        "Command": "Kedro Command",
    },
)
commands_chart = vm.Graph(id="commands_chart", figure=commands_fig)
# --- Plugins ---
url_kedro_plugins_mau = "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/refs/heads/main/data/02_intermediate/kedro_plugins_mau.csv"
# Read the CSV and trim/lower-case the headers in one step.
df_plugins = pd.read_csv(url_kedro_plugins_mau).rename(columns=lambda c: c.strip().lower())
# Add a parsed "Year-Month" timestamp column (the raw year_month column is
# kept alongside it), then switch the plotted columns to display names.
df_plugins = df_plugins.assign(
    **{"Year-Month": pd.to_datetime(df_plugins["year_month"], format="%Y-%m")}
).rename(columns={"first_two_words": "Plugin", "unique_users": "Unique Users"})

# One line (with markers) per plugin.
plugins_fig = px.line(
    data_frame=df_plugins,
    x="Year-Month",
    y="Unique Users",
    color="Plugin",
    markers=True,
    title="Kedro Plugins Usage (MAU by Plugin)",
    labels={
        "Year-Month": "Month",
        "Unique Users": "Unique Users",
        "Plugin": "Kedro Plugin",
    },
)
plugins_chart = vm.Graph(id="plugins_chart", figure=plugins_fig)
# Single-row grid: commands chart (component 0) next to the plugins chart
# (component 1).
page4_grid = [[0, 1]]
page4 = vm.Page(
    title="Kedro Commands & Plugins Telemetry MAU",
    components=[commands_chart, plugins_chart],
    layout=vm.Grid(grid=page4_grid),
)
# ===============================
# Page 5: Downloads by Country (Kedro & Kedro-Viz, 2025)
# ===============================
import pandas as pd
import vizro.models as vm
import vizro.plotly.express as px
from plotly import colors
# --- Load and prepare data ---
url_downloads_by_country = (
    "https://raw.githubusercontent.com/DimedS/kedro-pycafe-data/refs/heads/main/data/02_intermediate/downloads_by_country.csv"
)
# pandas treats the literal string "NA" as a missing value by default, which
# would turn Namibia's ISO-2 code into NaN and drop the country further down.
# Disable the default NA strings and treat only empty fields as missing.
df_geo = pd.read_csv(url_downloads_by_country, keep_default_na=False, na_values=[""])
df_geo.columns = [c.strip().lower() for c in df_geo.columns]  # country_code, count
# ISO-2 → ISO-3 mapping (the choropleth below uses locationmode="ISO-3")
iso2_to_iso3 = {
    "US": "USA", "NL": "NLD", "IE": "IRL", "SG": "SGP", "AU": "AUS", "JP": "JPN", "DE": "DEU", "BE": "BEL", "BR": "BRA",
    "CA": "CAN", "GB": "GBR", "PL": "POL", "IN": "IND", "FR": "FRA", "CH": "CHE", "SE": "SWE", "KR": "KOR", "FI": "FIN",
    "RU": "RUS", "CN": "CHN", "ES": "ESP", "IT": "ITA", "CL": "CHL", "IL": "ISR", "PR": "PRI", "HK": "HKG", "TW": "TWN",
    "TH": "THA", "PT": "PRT", "CO": "COL", "AE": "ARE", "MX": "MEX", "AT": "AUT", "VN": "VNM", "ZA": "ZAF", "DK": "DNK",
    "CZ": "CZE", "ID": "IDN", "AR": "ARG", "PE": "PER", "TR": "TUR", "IR": "IRN", "UA": "UKR", "HU": "HUN", "GE": "GEO",
    "PH": "PHL", "LV": "LVA", "NO": "NOR", "GT": "GTM", "SK": "SVK", "PA": "PAN", "MT": "MLT", "CR": "CRI", "KZ": "KAZ",
    "GR": "GRC", "EG": "EGY", "MY": "MYS", "EE": "EST", "LT": "LTU", "NZ": "NZL", "TG": "TGO", "BG": "BGR", "RO": "ROU",
    "LK": "LKA", "TN": "TUN", "EC": "ECU", "SA": "SAU", "SI": "SVN", "RS": "SRB", "PK": "PAK", "UY": "URY", "MA": "MAR",
    "LB": "LBN", "DO": "DOM", "HR": "HRV", "BD": "BGD", "KE": "KEN", "MU": "MUS", "IQ": "IRQ", "DZ": "DZA", "AM": "ARM",
    "NG": "NGA", "IS": "ISL", "BY": "BLR", "AZ": "AZE", "UZ": "UZB", "BA": "BIH", "CY": "CYP", "GH": "GHA", "VE": "VEN",
    "LU": "LUX", "QA": "QAT", "BO": "BOL", "ML": "MLI", "JO": "JOR", "ET": "ETH", "CI": "CIV", "ZW": "ZWE", "NP": "NPL",
    "UG": "UGA", "GI": "GIB", "MG": "MDG", "EU": "EUU", "KH": "KHM", "CD": "COD", "NC": "NCL", "PY": "PRY", "MK": "MKD",
    "CM": "CMR", "HN": "HND", "AL": "ALB", "AO": "AGO", "MM": "MMR", "GF": "GUF", "MW": "MWI", "MD": "MDA", "BH": "BHR",
    "MC": "MCO", "TT": "TTO", "OM": "OMN", "LI": "LIE", "BB": "BRB", "JM": "JAM", "KG": "KGZ", "BW": "BWA", "SV": "SLV",
    "NI": "NIC", "FO": "FRO", "PS": "PSE", "IM": "IMN", "GA": "GAB", "CG": "COG", "NA": "NAM", "SC": "SYC", "SN": "SEN",
    "ME": "MNE", "KW": "KWT"
}
# Codes that are not in the mapping become NaN here and are dropped below.
df_geo["iso3"] = df_geo["country_code"].map(iso2_to_iso3)
# --- Compute totals ---
# USA total is kept separately (in millions) for the map title.
usa_downloads = df_geo.loc[df_geo["country_code"] == "US", "count"].sum()
usa_millions = round(usa_downloads / 1_000_000, 1)
# Exclude USA for map clarity, and drop rows without an ISO-3 code
df_geo = df_geo[df_geo["country_code"] != "US"].dropna(subset=["iso3"])
# Add metrics: downloads in thousands and share of the non-US total (in %)
df_geo["downloads_k"] = (df_geo["count"] / 1_000).round(1)
total_ex_us = df_geo["count"].sum()
df_geo["percent"] = (df_geo["count"] / total_ex_us * 100).round(1)
# --- Improved colour scale for better contrast ---
contrasted_scale = colors.sequential.Tealgrn


# --- Choropleth map ---
@capture("graph")
def country_choropleth(data_frame, title, color_scale):
    """Build the dark-styled downloads-by-country choropleth.

    This is a Vizro custom chart on purpose: Vizro re-creates a graph from the
    captured function call, so an ``update_layout`` applied to a
    ``vizro.plotly.express`` figure outside of that call is not re-applied
    when the dashboard renders. Styling inside the captured function keeps it.

    Args:
        data_frame: One row per country with ``iso3``, ``downloads_k`` and
            ``percent`` columns.
        title: Chart title (may contain HTML such as ``<sup>``).
        color_scale: Continuous colour scale used for ``downloads_k``.

    Returns:
        The styled Plotly choropleth figure.
    """
    fig = px.choropleth(
        data_frame,
        locations="iso3",
        locationmode="ISO-3",
        color="downloads_k",
        color_continuous_scale=color_scale,
        projection="natural earth",
        title=title,
        labels={"downloads_k": "Downloads (k)", "percent": "Share (%)"},
        hover_data=["downloads_k", "percent"],
    )
    # NOTE(review): these hard-coded dark colours are applied regardless of
    # the dashboard theme - confirm that is intended for the light theme too.
    fig.update_layout(
        paper_bgcolor="#111111",
        plot_bgcolor="#111111",
        font_color="#F2F2F2",
        geo=dict(
            bgcolor="#D9D9D9", showframe=False, showcountries=True,
            countrycolor="#3C3C3C", landcolor="#1A1A1A", lakecolor="#BFBFBF", coastlinecolor="#444444"
        ),
        coloraxis_colorbar=dict(title="Downloads (k)", tickcolor="#F2F2F2"),
    )
    return fig


fig_map = country_choropleth(
    df_geo,
    title=(
        "Kedro & Kedro-Viz Downloads by Country (2025). "
        f"<sup>Colour scale excludes USA for visibility (USA: {usa_millions}M downloads)</sup>"
    ),
    color_scale=contrasted_scale,
)
# --- Complementary bar chart: top 15 countries ---
top_countries = df_geo.sort_values("count", ascending=False).head(15)
# Same Tealgrn scale as the map
contrasted_scale = colors.sequential.Tealgrn


def _spread_colors(scale, n):
    """Pick ``n`` colours evenly spaced from the first to the last entry of ``scale``.

    Returns an empty list for ``n == 0`` and just the first colour for
    ``n == 1`` (the evenly-spaced formula would divide by zero there).
    """
    if n <= 1:
        return list(scale[:n])
    last = len(scale) - 1
    return [scale[int(i * last / (n - 1))] for i in range(n)]


@capture("graph")
def top_countries_bar_chart(data_frame, color_scale):
    """Build the horizontal top-countries bar chart with manually coloured bars.

    This is a Vizro custom chart on purpose: Vizro re-creates a graph from the
    captured function call, so ``update_traces`` / ``update_layout`` applied
    to a ``vizro.plotly.express`` figure outside of that call are not
    re-applied when the dashboard renders.

    Args:
        data_frame: Rows to plot, already sorted by downloads descending, with
            ``iso3`` and ``downloads_k`` columns.
        color_scale: Sequence of colours sampled to colour the bars.

    Returns:
        The styled Plotly bar figure.
    """
    fig = px.bar(
        data_frame,
        x="downloads_k",
        y="iso3",
        orientation="h",
        text="downloads_k",
        # no color=... here: the bars are coloured manually below
        title="Top 15 Countries",
        labels={"downloads_k": "Downloads (k)", "iso3": "Country"},
    )
    # Colours follow row order: the first row gets the first colour of the scale.
    # NOTE(review): with descending input the largest bar gets the start of
    # the scale, whereas the map gives large values the end - confirm intent.
    fig.update_traces(
        marker_color=_spread_colors(color_scale, len(data_frame)),
        texttemplate="%{text}k",
        textposition="outside",
    )
    fig.update_layout(
        paper_bgcolor="#111111",
        plot_bgcolor="#111111",
        font_color="#F2F2F2",
        yaxis=dict(categoryorder="total ascending"),
    )
    return fig


fig_bar = top_countries_bar_chart(top_countries, color_scale=contrasted_scale)
# --- Vizro Page: one row of five grid columns; the map (component 0) takes
# four of them and the top-countries bar chart (component 1) takes the last ---
geo_map_graph = vm.Graph(id="geo_downloads_absolute_map", figure=fig_map)
top_countries_graph = vm.Graph(id="top_countries_bar", figure=fig_bar)
page5 = vm.Page(
    title="Downloads by Country",
    components=[geo_map_graph, top_countries_graph],
    layout=vm.Grid(grid=[[0, 0, 0, 0, 1]]),
)
# ===============================
# Combine all pages
# ===============================
# Assemble the five pages in navigation order, then build and start the app.
all_pages = [page1, page2, page3, page4, page5]
dashboard = vm.Dashboard(pages=all_pages)
Vizro().build(dashboard).run()