Data Science in a Box
Cornell College
DSC 223 - Fall 2022
October 11th, 2022
Two paintings very rich in composition, of a beautiful execution, and whose merit is very remarkable, each 17 inches 3 lines high, 23 inches wide; the first, painted on wood, comes from the Cabinet of Madame la Comtesse de Verrue; it represents a departure for the hunt: it shows in the front a child on a white horse, a man who gives the horn to gather the dogs, a falconer and other figures nicely distributed across the width of the painting; two horses drinking from a fountain; on the right in the corner a lovely country house topped by a terrace, on which people are at the table, others who play instruments; trees and fabriques pleasantly enrich the background.
Rows: 1
Columns: 61
$ name <chr> "R1777-89a"
$ sale <chr> "R1777"
$ lot <chr> "89"
$ position <dbl> 0.3755274
$ dealer <chr> "R"
$ year <dbl> 1777
$ origin_author <chr> "D/FL"
$ origin_cat <chr> "D/FL"
$ school_pntg <chr> "D/FL"
$ diff_origin <dbl> 0
$ logprice <dbl> 8.575462
$ price <dbl> 5300
$ count <dbl> 1
$ subject <chr> "D\x8epart pour la chasse"
$ authorstandard <chr> "Wouwerman, Philips"
$ artistliving <dbl> 0
$ authorstyle <chr> NA
$ author <chr> "Philippe Wouwermans"
$ winningbidder <chr> "Langlier, Jacques for Poullain, Antoine"
$ winningbiddertype <chr> "DC"
$ endbuyer <chr> "C"
$ Interm <dbl> 1
$ type_intermed <chr> "D"
$ Height_in <dbl> 17.25
$ Width_in <dbl> 23
$ Surface_Rect <dbl> 396.75
$ Diam_in <dbl> NA
$ Surface_Rnd <dbl> NA
$ Shape <chr> "squ_rect"
$ Surface <dbl> 396.75
$ material <chr> "bois"
$ mat <chr> "b"
$ materialCat <chr> "wood"
$ quantity <dbl> 1
$ nfigures <dbl> 0
$ engraved <dbl> 0
$ original <dbl> 0
$ prevcoll <dbl> 1
$ othartist <dbl> 0
$ paired <dbl> 1
$ figures <dbl> 0
$ finished <dbl> 0
$ lrgfont <dbl> 0
$ relig <dbl> 0
$ landsALL <dbl> 1
$ lands_sc <dbl> 0
$ lands_elem <dbl> 1
$ lands_figs <dbl> 1
$ lands_ment <dbl> 0
$ arch <dbl> 1
$ mytho <dbl> 0
$ peasant <dbl> 0
$ othgenre <dbl> 0
$ singlefig <dbl> 0
$ portrait <dbl> 0
$ still_life <dbl> 0
$ discauth <dbl> 0
$ history <dbl> 0
$ allegory <dbl> 0
$ pastorale <dbl> 0
$ other <dbl> 0
Rows: 1
Columns: 61
$ name <chr> "R1777-89a"
$ sale <chr> "R1777"
$ lot <chr> "89"
$ position <dbl> 0.3755274
$ dealer <chr> "R"
$ year <dbl> 1777
$ origin_author <chr> "D/FL"
$ origin_cat <chr> "D/FL"
$ school_pntg <chr> "D/FL"
$ diff_origin <dbl> 0
$ logprice <dbl> 8.575462
$ price <dbl> 5300
$ count <dbl> 1
$ subject <chr> "D\x8epart pour la chasse"
$ authorstandard <chr> "Wouwerman, Philips"
$ artistliving <dbl> 0
$ authorstyle <chr> NA
$ author <chr> "Philippe Wouwermans"
$ winningbidder <chr> "Langlier, Jacques for Poullain, Antoine"
$ winningbiddertype <chr> "DC"
$ endbuyer <chr> "C"
$ Interm <dbl> 1
$ type_intermed <chr> "D"
$ Height_in <dbl> 17.25
$ Width_in <dbl> 23
$ Surface_Rect <dbl> 396.75
$ Diam_in <dbl> NA
$ Surface_Rnd <dbl> NA
$ Shape <chr> "squ_rect"
$ Surface <dbl> 396.75
$ material <chr> "bois"
$ mat <chr> "b"
$ materialCat <chr> "wood"
$ quantity <dbl> 1
$ nfigures <dbl> 0
$ engraved <dbl> 0
$ original <dbl> 0
$ prevcoll <dbl> 1
$ othartist <dbl> 0
$ paired <dbl> 1
$ figures <dbl> 0
$ finished <dbl> 0
$ lrgfont <dbl> 0
$ relig <dbl> 0
$ landsALL <dbl> 1
$ lands_sc <dbl> 0
$ lands_elem <dbl> 1
$ lands_figs <dbl> 1
$ lands_ment <dbl> 0
$ arch <dbl> 1
$ mytho <dbl> 0
$ peasant <dbl> 0
$ othgenre <dbl> 0
$ singlefig <dbl> 0
$ portrait <dbl> 0
$ still_life <dbl> 0
$ discauth <dbl> 0
$ history <dbl> 0
$ allegory <dbl> 0
$ pastorale <dbl> 0
$ other <dbl> 0
Response variable: Variable whose behavior or variation you are trying to understand, on the y-axis
Explanatory variables: Other variables that you want to use to explain the variation in the response, on the x-axis
Predicted value: Output of the model function
Residuals: A measure of how far each case is from its predicted value (based on a particular model)
ht_wt_fit <- linear_reg() %>%
set_engine("lm") %>%
fit(Height_in ~ Width_in, data = pp)
ht_wt_fit_tidy <- tidy(ht_wt_fit$fit)
ht_wt_fit_aug <- augment(ht_wt_fit$fit) %>%
mutate(res_cat = ifelse(.resid > 0, TRUE, FALSE))
ggplot(data = ht_wt_fit_aug) +
geom_point(aes(x = Width_in, y = Height_in, color = res_cat)) +
geom_line(aes(x = Width_in, y = .fitted), size = 0.75, color = "#8E2C90") +
labs(
title = "Height vs. width of paintings",
subtitle = "Paris auctions, 1764 - 1780",
x = "Width (inches)",
y = "Height (inches)"
) +
guides(color = "none") +
scale_color_manual(values = c("#260b27", "#523178")) +
geom_text(aes(x = 0, y = 150), label = "Positive residual", color = "#e6b0e7", hjust = 0, size = 8) +
geom_text(aes(x = 150, y = 25), label = "Negative residual", color = "#260b27", hjust = 0, size = 8)
The plot below displays the relationship between height and width of paintings. The only difference from the previous plots is that it uses a smaller alpha value, making the points somewhat transparent. What feature is apparent in this plot that was not (as) apparent in the previous plots? What might be the reason for this feature?
How, if at all, does the relationship between width and height of paintings vary by whether or not they have any landscape elements?
ggplot(data = pp, aes(x = Width_in, y = Height_in, color = factor(landsALL))) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = "Height vs. width of paintings, by landscape features",
subtitle = "Paris auctions, 1764 - 1780",
x = "Width (inches)",
y = "Height (inches)",
color = "landscape"
) +
scale_color_manual(values = c("#E48957", "#071381"))
ggplot(data = pp, aes(x = Width_in, y = Height_in, color = factor(landsALL))) +
geom_point(alpha = 0.4) +
geom_smooth(method = "lm", se = FALSE,
fullrange = TRUE) + #<<
labs(
title = "Height vs. width of paintings, by landscape features",
subtitle = "Paris auctions, 1764 - 1780",
x = "Width (inches)",
y = "Height (inches)",
color = "landscape"
) +
scale_color_manual(values = c("#E48957", "#071381"))
is just as important as the model, if not more!
Statistics is the explanation of variation in the context of what remains unexplained.
\[\widehat{height}_{i} = \beta_0 + \beta_1 \times width_{i}\]
… using formula syntax
parsnip model object
Call:
stats::lm(formula = Height_in ~ Width_in, data = data)
Coefficients:
(Intercept) Width_in
3.6214 0.7808
\[\widehat{height}_{i} = 3.6214 + 0.7808 \times width_{i}\]
# A tibble: 2 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 3.62 0.254 14.3 8.82e-45
2 Width_in 0.781 0.00950 82.1 0
\[\widehat{height}_{i} = 3.62 + 0.781 \times width_{i}\]
\[\widehat{height}_{i} = 3.62 + 0.781 \times width_{i}\] - Slope: For each additional inch the painting is wider, the height is expected to be higher, on average, by 0.781 inches.
Remember this when interpreting model coefficients
\[\hat{y}_{i} = \beta_0 + \beta_1~x_{i}\] - Tough luck, you can’t have them…
\[\hat{y}_{i} = b_0 + b_1~x_{i}\]
\[\bar{y} = b_0 + b_1 \bar{x} ~ \rightarrow ~ b_0 = \bar{y} - b_1 \bar{x}\]
The slope has the same sign as the correlation coefficient: \(b_1 = r \frac{s_y}{s_x}\)
The sum of the residuals is zero: \(\sum_{i = 1}^n e_i = 0\)
The residuals and \(x\) values are uncorrelated
# A tibble: 3,393 × 3
name Height_in landsALL
<chr> <dbl> <dbl>
1 L1764-2 37 0
2 L1764-3 18 0
3 L1764-4 13 1
4 L1764-5a 14 1
5 L1764-5b 14 1
6 L1764-6 7 0
7 L1764-7a 6 0
8 L1764-7b 6 0
9 L1764-8 15 0
10 L1764-9a 9 0
11 L1764-9b 9 0
12 L1764-10a 16 1
13 L1764-10b 16 1
14 L1764-10c 16 1
15 L1764-11 20 0
16 L1764-12a 14 1
17 L1764-12b 14 1
18 L1764-13a 15 1
19 L1764-13b 15 1
20 L1764-14 37 0
# … with 3,373 more rows
landsALL = 0
: No landscape featureslandsALL = 1
: Some landscape features\[\widehat{Height_{in}} = 22.7 - 5.645~landsALL\]
landsALL = 0
) to the other level (landsALL = 1
)# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 14.0 10.0 1.40 0.162
2 school_pntgD/FL 2.33 10.0 0.232 0.816
3 school_pntgF 10.2 10.0 1.02 0.309
4 school_pntgG 1.65 11.9 0.139 0.889
5 school_pntgI 10.3 10.0 1.02 0.306
6 school_pntgS 30.4 11.4 2.68 0.00744
7 school_pntgX 2.87 10.3 0.279 0.780
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 14.0 10.0 1.40 0.162
2 school_pntgD/FL 2.33 10.0 0.232 0.816
3 school_pntgF 10.2 10.0 1.02 0.309
4 school_pntgG 1.65 11.9 0.139 0.889
5 school_pntgI 10.3 10.0 1.02 0.306
6 school_pntgS 30.4 11.4 2.68 0.00744
7 school_pntgX 2.87 10.3 0.279 0.780
school_pntg | D_FL | F | G | I | S | X |
---|---|---|---|---|---|---|
A | 0 | 0 | 0 | 0 | 0 | 0 |
D/FL | 1 | 0 | 0 | 0 | 0 | 0 |
F | 0 | 1 | 0 | 0 | 0 | 0 |
G | 0 | 0 | 1 | 0 | 0 | 0 |
I | 0 | 0 | 0 | 1 | 0 | 0 |
S | 0 | 0 | 0 | 0 | 1 | 0 |
X | 0 | 0 | 0 | 0 | 0 | 1 |
# A tibble: 3,393 × 3
name Height_in school_pntg
<chr> <dbl> <chr>
1 L1764-2 37 F
2 L1764-3 18 I
3 L1764-4 13 D/FL
4 L1764-5a 14 F
5 L1764-5b 14 F
6 L1764-6 7 I
7 L1764-7a 6 F
8 L1764-7b 6 F
9 L1764-8 15 I
10 L1764-9a 9 D/FL
11 L1764-9b 9 D/FL
12 L1764-10a 16 X
13 L1764-10b 16 X
14 L1764-10c 16 X
15 L1764-11 20 D/FL
16 L1764-12a 14 D/FL
17 L1764-12b 14 D/FL
18 L1764-13a 15 D/FL
19 L1764-13b 15 D/FL
20 L1764-14 37 F
# … with 3,373 more rows
# A tibble: 7 × 5
term estimate std.error statistic p.value
<chr> <dbl> <dbl> <dbl> <dbl>
1 (Intercept) 14.0 10.0 1.40 0.162
2 school_pntgD/FL 2.33 10.0 0.232 0.816
3 school_pntgF 10.2 10.0 1.02 0.309
4 school_pntgG 1.65 11.9 0.139 0.889
5 school_pntgI 10.3 10.0 1.02 0.306
6 school_pntgS 30.4 11.4 2.68 0.00744
7 school_pntgX 2.87 10.3 0.279 0.780