Categorizing Pitches in Baseball

How is pitch type detected today?

There are three key features of every pitch to watch for:

  1. Velocity - the speed of the pitch
  2. Movement - the general movement of the pitch
  3. Break - the shift in direction

…and by asking the pitcher what types he throws.



Source: Baseball pitches illustrated by Lokesh Dhakar

What types of pitches are there?

Fastballs

Four-seam

Two-seam

Cutter

Sinker

Four-seam fastball path

Breaking balls

Curveball

Slider

Slurve

Screwball

Curveball breaking ball path

Offspeed

Changeup

Palmball

Circle change

Splitter

Changeup offspeed pitch path

There are a few other pitches that don’t quite fall into any of those categories: the eephus, knuckleball, forkball, and gyroball. Source: Baseball pitches illustrated by Lokesh Dhakar

How many pitch types exist?

Code
# Count how many pitches of each type (and category) appear in the data,
# most frequent first.
# NOTE(review): no `by` is supplied, so the join key is whatever column names
# `data.all` and `pitchtypes` share -- confirm the intended key.
pitch.types.df <- data.all |>
  left_join(pitchtypes) |>
  mutate(pitch = factor(pitch)) |>
  group_by(pitch, category) |>
  count() |>
  arrange(desc(n))

# Bar chart of pitch counts ordered from most to least common. The pitch names
# are drawn as rotated text starting just above the baseline, replacing the
# x-axis tick labels, which are hidden.
ggplot(pitch.types.df, aes(x = reorder(pitch, -n), y = n)) +
  geom_col(fill = "#999999") +
  geom_text(
    aes(label = pitch, y = 0),
    angle = 90,
    hjust = 0,
    nudge_y = 5000
  ) +
  scale_y_continuous(labels = scales::label_comma()) +
  labs(
    x = "Pitch Type",
    y = "Number of Pitches"
  ) +
  ggthemes::theme_hc() +
  theme(
    axis.text.x = element_blank(),
    axis.ticks.x = element_blank()
  )
Distribution of pitch types in data

What data do we have to work with?

Variables that I use from the dataset:

  • Pitcher ID (pitcherId)
  • Pitch type (pitchType, pitch, category)
  • Release velocity (releaseVelocity)
  • Velocity along the X, Y, and Z axes of the field at the first measurement (vx0, vy0, vz0)
  • Acceleration of the ball (ax, ay, az)
  • Spin rate (spinRate)
  • Spin direction (spinDir)
  • Distance from the horizontal center of the plate as the ball crosses the front plane of the plate (px)
  • Height above the ground as the ball crosses the front plane of the plate (pz)
  • Distance from the center of the plate at the first measurement (x0, y0, z0)

How many clusters should I use?

# Tune the number of k-means clusters using 5-fold cross-validation.
pitch.data.cv <- vfold_cv(pitch.data, v = 5)

# k-means specification whose cluster count is left open for tuning.
km.stats.spec <- k_means(
  mode = "partition",
  engine = "stats",
  num_clusters = tune()
)

# Cluster on every column except the identifier / label columns.
# NOTE(review): no scaling step is applied in this recipe, so variables with
# large numeric ranges may dominate the distance calculation -- confirm that
# this is intended.
pitch.data.recipe <- recipe(~ ., data = pitch.data) |>
  step_rm(pitcherId, pitchType, pitch, category)

km.workflow <- workflow(pitch.data.recipe, km.stats.spec)

# Candidate cluster counts: 12 regularly spaced levels over the range 4 to 15.
cluster.grid <- grid_regular(
  num_clusters(range = c(4, 15)),
  levels = 12
)

km.results <- tune_cluster(
  km.workflow,
  resamples = pitch.data.cv,
  grid = cluster.grid,
  metrics = cluster_metric_set(sse_within_total, sse_total, sse_ratio),
  control = control_grid(save_pred = TRUE, extract = identity)
)

km.performance <- collect_metrics(km.results)

# Plot the mean sse_ratio for each candidate number of clusters.
km.performance |>
  filter(.metric == "sse_ratio") |>
  ggplot(aes(x = num_clusters, y = mean)) +
  geom_point() +
  geom_line() +
  scale_x_continuous(breaks = 4:15) +
  labs(
    x = "Number of Clusters",
    y = "Mean WSS/TSS ratio over 5 folds"
  ) +
  ggthemes::theme_hc()

Clustering pitches together

Code
# Fit the final k-means model with a fixed number of clusters.
n_clusts <- 10
km.spec.n <- k_means(
  mode = "partition",
  engine = "stats",
  num_clusters = n_clusts
)

# The formula removes the identifier / label columns from the clustering input.
km.fit <- fit(
  km.spec.n,
  ~ . - pitcherId - pitchType - pitch - category,
  data = pitch.data
)

# Per-cluster summary table (size plus the selected feature columns), rounded
# to two decimals with thousands separators and no scientific notation.
km.fit |>
  tidy() |>
  select(
    cluster, size, releaseVelocity, spinRate, spinDir,
    px, pz, x0, z0,
    vx0, vy0, vz0,
    ax, ay, az
  ) |>
  knitr::kable(
    digits = 2,
    format.args = list(big.mark = ",", scientific = FALSE)
  ) |>
  kableExtra::kable_styling(latex_options = "scale_down")
cluster size releaseVelocity spinRate spinDir px pz x0 z0 vx0 vy0 vz0 ax ay az
1 253,185 90.56 2,156.25 189.05 -0.07 2.41 -0.70 5.87 2.35 -132.52 -5.38 -3.85 28.81 -18.65
2 250,592 89.57 1,981.08 190.69 -0.13 2.41 -0.90 5.86 2.80 -131.10 -4.75 -4.36 28.00 -21.01
3 235,132 84.37 1,263.99 157.67 -0.04 2.19 -0.99 5.92 2.12 -123.57 -3.16 0.87 24.32 -28.40
4 239,909 83.56 865.60 161.01 0.02 2.05 -0.76 5.92 1.71 -122.38 -3.09 1.29 23.55 -30.08
5 70,505 92.46 2,933.46 166.03 0.09 2.31 0.53 5.86 -2.16 -135.13 -7.12 5.36 31.59 -12.25
6 259,554 86.12 1,569.13 194.63 -0.18 2.29 -0.95 5.87 2.62 -126.11 -3.43 -3.37 25.66 -26.43
7 223,821 91.44 2,336.79 190.16 -0.01 2.39 -0.61 5.88 2.35 -133.76 -6.03 -4.10 29.62 -16.41
8 227,071 88.11 1,801.24 190.41 -0.17 2.36 -1.04 5.85 3.02 -129.00 -4.05 -4.27 27.02 -23.77
9 208,841 91.77 2,563.43 174.45 0.06 2.36 0.10 5.88 -0.33 -134.20 -6.51 1.27 30.24 -14.62
10 175,253 83.34 430.82 157.31 0.05 1.93 -0.61 5.91 1.48 -122.04 -3.09 0.89 23.57 -31.27

How did the model perform?

Code
# Heat map of how each labelled pitch type is distributed across the predicted
# clusters. Tile opacity and the printed label both encode the pitch count;
# rows are faceted by pitch category.
dists |>
  group_by(category, pitchType, .cluster_pred) |>
  count() |>
  ggplot(aes(
    x = pitchType,
    y = .cluster_pred,
    fill = .cluster_pred,
    alpha = n,
    # Namespace both scales helpers so the chunk does not depend on
    # library(scales) being attached.
    label = scales::label_number(
      accuracy = 1,
      scale_cut = scales::cut_short_scale()
    )(n),
    group = category
  )) +
  geom_tile() +
  # Literal colour names are used as-is via scale_color_identity() below.
  geom_text(aes(color = ifelse(n < 15000, "white", "black"))) +
  labs(
    x = "Pitch Type",
    y = ""
  ) +
  # Request the viridis palette explicitly: "viridis" is not a documented
  # value for scale_fill_discrete(type = ), so the original call could fall
  # back to the default hue palette depending on the ggplot2 version.
  scale_fill_viridis_d() +
  scale_x_discrete(limits = rev) +
  scale_color_identity() +
  facet_grid(category ~ ., scales = "free", switch = "y", space = "free_y") +
  coord_flip() +
  guides(fill = "none", alpha = "none") +
  ggthemes::theme_hc() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank(),
    panel.grid.major.y = element_blank()
  )

What about those pitches that don’t really fit?

Code
# Flag outliers within each predicted cluster using the 1.5 * IQR rule on the
# distance from the centroid. The `outlier` column stores the plotting colour
# directly: "#DC3220" for points outside either fence, "#212529" otherwise.
outliers <- dists |>
  group_by(.cluster_pred) |>
  mutate(
    outlier = ifelse(
      dist_from_centroid >
        quantile(dist_from_centroid, 0.75) + 1.5 * IQR(dist_from_centroid) |
        dist_from_centroid <
          quantile(dist_from_centroid, 0.25) - 1.5 * IQR(dist_from_centroid),
      "#DC3220",
      "#212529"
    )
  )

# Box plot of centroid distances per cluster. The built-in outlier markers are
# suppressed and the flagged points are drawn as jittered, semi-transparent
# dots instead.
ggplot(outliers, aes(x = .cluster_pred, y = dist_from_centroid)) +
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(
    data = filter(outliers, outlier == "#DC3220"),
    mapping = aes(color = outlier),
    width = .15,
    alpha = .1
  ) +
  scale_color_identity() +
  guides(color = "none") +
  labs(
    y = "Distance from Centroid",
    x = ""
  ) +
  ggthemes::theme_hc()

What do those outliers look like?

Code
# Cluster 1: share of each pitch type among flagged outliers ("#DC3220",
# positive side) versus the remaining pitches ("#212529", mirrored onto the
# negative side). Shares are computed within each of the two groups.
outliers |>
  filter(.cluster_pred == "Cluster 1") |>
  group_by(category, pitchType, outlier) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(outlier) |>
  mutate(
    pct = n / sum(n),
    # Negate the non-outlier shares so the two groups point in opposite
    # directions from zero.
    pct = ifelse(outlier == "#212529", -pct, pct)
  ) |>
  ggplot(aes(x = pitchType, y = pct, fill = outlier)) +
  geom_col() +
  facet_grid(category ~ ., scales = "free", switch = "y", space = "free_y") +
  # Axis labels show absolute percentages on both sides of zero.
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    limits = c(-1, 1),
    labels = c("100%", "50%", "0%", "50%", "100%")
  ) +
  scale_fill_identity() +
  coord_flip() +
  labs(
    x = "Pitch Type",
    y = "",
    title = "Proportion of Pitch Types assigned to Cluster 1"
  ) +
  ggthemes::theme_hc() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank()
  )

Code
# Cluster 5: share of each pitch type among flagged outliers ("#DC3220",
# positive side) versus the remaining pitches ("#212529", mirrored onto the
# negative side). Shares are computed within each of the two groups.
outliers |>
  filter(.cluster_pred == "Cluster 5") |>
  group_by(category, pitchType, outlier) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(outlier) |>
  mutate(
    pct = n / sum(n),
    # Negate the non-outlier shares so the two groups point in opposite
    # directions from zero.
    pct = ifelse(outlier == "#212529", -pct, pct)
  ) |>
  ggplot(aes(x = pitchType, y = pct, fill = outlier)) +
  geom_col() +
  facet_grid(category ~ ., scales = "free", switch = "y", space = "free_y") +
  # Axis labels show absolute percentages on both sides of zero.
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    limits = c(-1, 1),
    labels = c("100%", "50%", "0%", "50%", "100%")
  ) +
  scale_fill_identity() +
  coord_flip() +
  labs(
    x = "Pitch Type",
    y = "",
    title = "Proportion of Pitch Types assigned to Cluster 5"
  ) +
  ggthemes::theme_hc() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank()
  )

Code
# Cluster 10: share of each pitch type among flagged outliers ("#DC3220",
# positive side) versus the remaining pitches ("#212529", mirrored onto the
# negative side). Shares are computed within each of the two groups.
outliers |>
  filter(.cluster_pred == "Cluster 10") |>
  group_by(category, pitchType, outlier) |>
  summarise(n = n(), .groups = "drop") |>
  group_by(outlier) |>
  mutate(
    pct = n / sum(n),
    # Negate the non-outlier shares so the two groups point in opposite
    # directions from zero.
    pct = ifelse(outlier == "#212529", -pct, pct)
  ) |>
  ggplot(aes(x = pitchType, y = pct, fill = outlier)) +
  geom_col() +
  facet_grid(category ~ ., scales = "free", switch = "y", space = "free_y") +
  # Axis labels show absolute percentages on both sides of zero.
  scale_y_continuous(
    breaks = seq(-1, 1, by = 0.5),
    limits = c(-1, 1),
    labels = c("100%", "50%", "0%", "50%", "100%")
  ) +
  scale_fill_identity() +
  coord_flip() +
  labs(
    x = "Pitch Type",
    y = "",
    title = "Proportion of Pitch Types assigned to Cluster 10"
  ) +
  ggthemes::theme_hc() +
  theme(
    strip.placement = "outside",
    strip.background = element_blank()
  )

Thank you



Christopher Teixeira

christopherteixeira.com



chris@christopherteixeira.com

in/christopherteixeira

ct-analytics