# Attach every package used in this document with a single call.
# Entries that are commented out are intentionally not loaded.
easypackages::libraries(
  # Data i/o
  "here", # relative file path
  "rio", # file import-export
  # Data manipulation
  "janitor", # data cleaning fns
  "haven", # stata, sas, spss data io
  "labelled", # var labelling
  "readxl", # excel sheets
  # "scales", # to change formats and units
  "skimr", # quick data summary
  "broom", # view model results
  # Data analysis
  "DHS.rates", # demographic rates for dhs-like surveys
  "GeneralOaxaca", # BO decomposition for non-linear
  "survey", # apply survey weights
  # Analysis output
  "gt",
  # "modelsummary", # output summary tables
  "gtsummary", # output summary tables
  "flextable", # creating tables from objects
  "officer", # editing in office docs
  # R graph related packages
  "ggstats",
  "RColorBrewer",
  # "scales",
  "patchwork",
  # Misc packages
  "tidyverse", # Data manipulation iron man
  "tictoc" # Code timing
)
BDDHS data pooling pre-checks
Getting started
Here we show the pre-requisite code sections. Run these at the outset to avoid errors. First we load the required packages.
Next we turn off scientific notations.
# A large positive `scipen` biases number printing towards fixed notation.
options(scipen = 999)
Next we set the default gtsummary print engine for tables.
# Print gtsummary tables through the flextable engine.
theme_gtsummary_printer(print_engine = "flextable")
Now we set the flextable output defaults.
# Global flextable look-and-feel applied to every table created afterwards.
set_flextable_defaults(
  # Theme and layout
  theme_fun = theme_vanilla,
  table.layout = "autofit",
  background.color = "white",
  # Text formatting
  font.size = 11,
  text.align = "left",
  # No thousands separator when numbers are formatted by flextable
  big.mark = ""
)
Document introduction
Here we document the variable codes and labels of variables across all the Bangladesh Demographic and Health Survey (DHS) datasets. We check the variable labels and codes before running the pooling code in “daprep-v01_bddhs.R”. We pool the following Bangladesh DHS surveys:
# Creating the table of surveys to be used for pooling.
# Starts from the births summary and adds the women, household and person
# counts via one left join per summary tibble; counts are formatted with a
# comma as thousands separator before joining.
bdbr1_tmp_intro |>
  mutate(n_births = prettyNum(n_births, big.mark = ",")) |>
  select(c(ctr_name, svy_year, n_births)) |>
  # Join vars from bdir1_tmp_intro (its survey-year column is named `year`)
  left_join(
    bdir1_tmp_intro |>
      mutate(n_women = prettyNum(n_women, big.mark = ",")) |>
      select(c(year, n_women)),
    by = join_by(svy_year == year)
  ) |>
  # Join vars from bdhr1_tmp_intro
  left_join(
    bdhr1_tmp_intro |>
      mutate(n_households = prettyNum(n_households, big.mark = ",")) |>
      select(svy_year, n_households),
    by = join_by(svy_year)
  ) |>
  # Join vars from bdpr1_tmp_intro
  left_join(
    bdpr1_tmp_intro |>
      mutate(n_persons = prettyNum(n_persons, big.mark = ",")) |>
      select(svy_year, n_persons),
    by = join_by(svy_year)
  ) |>
  # convert nested tibble to simple tibble
  unnest(cols = c()) |>
  # add a running survey code as the first column
  mutate(
    ccode = row_number(),
    .before = ctr_name
  ) |>
  # convert to flextable object
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ccode | ctr_name | svy_year | n_births | n_women | n_households | n_persons |
---|---|---|---|---|---|---|
1 | Bangladesh | 1993 | 32,581 | 9,493 | 9,174 | 51,631 |
2 | Bangladesh | 1996 | 29,344 | 8,981 | 8,682 | 47,432 |
3 | Bangladesh | 1999 | 31,906 | 10,373 | 9,854 | 54,627 |
4 | Bangladesh | 2004 | 33,597 | 11,300 | 10,500 | 55,883 |
5 | Bangladesh | 2007 | 30,527 | 10,996 | 10,400 | 53,413 |
6 | Bangladesh | 2011 | 45,833 | 17,749 | 17,141 | 83,731 |
7 | Bangladesh | 2014 | 43,772 | 17,863 | 17,300 | 81,624 |
8 | Bangladesh | 2017 | 47,828 | 20,127 | 19,457 | 89,819 |
9 | Bangladesh | 2022 | 64,722 | 30,078 | 30,018 | 132,463 |
We use the following variables for the pooled data analysis:
- Dependent variable
- infantd = Index child died during infancy period (0-11 months)
- Main Independent variable
- sibsurv_nmv = Survival status of preceding child (Death scarring)
- binterval_3c_nmv_opp = Birth interval preceding to index child
- Independent variables [CHILD LEVEL]
- cyob10y_opp = Birth cohort of index child
- bord_c = Birth order of index child
- sex_fm = Gender of index child
- season = Season during birth
- Independent variables [MOTHER/PARENT LEVEL]
- ~~myob_opp = Birth cohort of mother~~
- macb_c_opp = Mother’s age during birth of index child
- medu_opp = Mother’s Level of education
- fedu_opp = Father’s level of education
- Independent variables [HOUSEHOLD LEVEL]
- religion = Religion
- nat_lang = Native language of respondent
- wi_qt_opp = Household wealth quintile
- ~~hhgen_2c_opp = Generations in household~~
- hhstruc_opp = Household structure
- head_sex_fm = Sex of HH head
- Independent variables [COMMUNITY LEVEL]
- por = Place of residence of the household
- ecoreg = Ecological region
Note: (a) Crossed-out names indicate variables that are not included.
Data import
We will directly import the nested tibble here. The code for dataset preparation is in the “daprep-v01_bddhs.R” script file.
# Here we import the tibbles to be used for dataset checking.
# Each .rds file is read from the "website_data" folder, resolved relative to
# the project root with here().
# Import the bdbr nested tibble
bdbr1_pre_tmp0 <- read_rds(file = here("website_data", "bdbr1_nest0.rds"))
# Import the bdhr nested tibble
bdhr1_pre_tmp0 <- read_rds(file = here("website_data", "bdhr1_nest0.rds"))
# Import the bdpr nested tibble
bdpr1_pre_tmp0 <- read_rds(file = here("website_data", "bdpr1_nest0.rds"))
Bangladesh BR dataset use for variable creation
Checking the Women’s weight variable before harmonization
We will check the formatting of the v005 women’s weight variable before creating the pooled survey weight. For this we will use the labelled::look_for().
# First we create the data dictionary of v005 in nested tibble:
# one look_for() dictionary per survey dataset stored in `bdbr_data`.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v005 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v005) |>
        look_for(details = "full") |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character() |>
        # Drop the detail columns from `levels` through `n_na`
        select(-c(levels:n_na))
    }
  ))
bdbr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v005)) |>
  select(-pos)
# Convert and view the tibble as flextable
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|
Bangladesh | 1993 | v005 | sample weight | dbl | 0 | 14 | 292519 - 1327458 |
Bangladesh | 1996 | v005 | sample weight | dbl | 0 | 16 | 191059 - 1453427 |
Bangladesh | 1999 | v005 | sample weight | dbl | 0 | 12 | 250565 - 1494863 |
Bangladesh | 2004 | v005 | sample weight | dbl | 0 | 350 | 55728 - 2707592 |
Bangladesh | 2007 | v005 | sample weight | dbl | 0 | 359 | 135650 - 3592687 |
Bangladesh | 2011 | v005 | women's individual sample weight (6 decimals) | dbl | 0 | 465 | 184316 - 3381150 |
Bangladesh | 2014 | v005 | women's individual sample weight (6 decimals) | dbl | 0 | 454 | 153594 - 14194492 |
Bangladesh | 2017 | v005 | women's individual sample weight (6 decimals) | dbl | 0 | 661 | 158603 - 2465775 |
Bangladesh | 2022 | v005 | women's individual sample weight (6 decimals) | dbl | 0 | 660 | 91325 - 3891736 |
The women’s weight variables are of numeric class and have no missing values. Therefore, we need not reformat them and can use them directly for preparing the pooled survey weight. Note that the women’s weight variables for the Bangladesh 1993, 1996 and 1999 rounds have few unique values. This could be because there were fewer sampling units in the secondary stage.
Checking the ID variables before harmonization
Here we check the formatting of the variables using which we will prepare the ID variables for the pooled Bangladesh birth history recode (br) dataset. We will use the following constituent variables for creating the ID variables for the pooled dataset:
# We check the var type of ID vars in all bdbr datasets.
# First we create a data dictionary of the bdbr datasets in nested tibble:
# one look_for() dictionary of the constituent ID variables per survey.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_idvars = map(
    bdbr_data,
    \(df) {
      df |>
        select(v001, v002, v003, bord, v021, v022, v023, v024) |>
        look_for(details = "full") |>
        # Drop the detail columns from `levels` through `n_na`
        select(-c(levels:n_na)) |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character()
    }
  ))
bdbr1_pre_tmp1
# Now we unnest the tibble and output the pooled data dictionary,
# sorted by variable position so each variable's rounds sit together.
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_idvars)) |>
  arrange(pos)
# Convert and view the tibble as flextable
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | 1 | v001 | cluster number | dbl | 0 | 301 | 101 - 573 |
Bangladesh | 1996 | 1 | v001 | cluster number | dbl | 0 | 313 | 101 - 630 |
Bangladesh | 1999 | 1 | v001 | cluster number | dbl | 0 | 341 | 3 - 500 |
Bangladesh | 2004 | 1 | v001 | cluster number | dbl | 0 | 361 | 1 - 550 |
Bangladesh | 2007 | 1 | v001 | cluster number | dbl | 0 | 361 | 1 - 361 |
Bangladesh | 2011 | 1 | v001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2014 | 1 | v001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2017 | 1 | v001 | cluster number | dbl | 0 | 672 | 1 - 675 |
Bangladesh | 2022 | 1 | v001 | cluster number | dbl | 0 | 674 | 1 - 675 |
Bangladesh | 1993 | 2 | v002 | household number | dbl | 0 | 529 | 1 - 615 |
Bangladesh | 1996 | 2 | v002 | household number | dbl | 0 | 537 | 1 - 638 |
Bangladesh | 1999 | 2 | v002 | household number | dbl | 0 | 489 | 1 - 544 |
Bangladesh | 2004 | 2 | v002 | household number | dbl | 0 | 235 | 1 - 280 |
Bangladesh | 2007 | 2 | v002 | household number | dbl | 0 | 194 | 1 - 252 |
Bangladesh | 2011 | 2 | v002 | household number | dbl | 0 | 178 | 1 - 217 |
Bangladesh | 2014 | 2 | v002 | household number | dbl | 0 | 203 | 1 - 222 |
Bangladesh | 2017 | 2 | v002 | household number | dbl | 0 | 250 | 1 - 299 |
Bangladesh | 2022 | 2 | v002 | household number | dbl | 0 | 203 | 1 - 220 |
Bangladesh | 1993 | 3 | v003 | respondent's line number | dbl | 0 | 22 | 1 - 26 |
Bangladesh | 1996 | 3 | v003 | respondent's line number | dbl | 0 | 23 | 1 - 28 |
Bangladesh | 1999 | 3 | v003 | respondent's line number | dbl | 0 | 19 | 1 - 19 |
Bangladesh | 2004 | 3 | v003 | respondent's line number | dbl | 0 | 24 | 1 - 30 |
Bangladesh | 2007 | 3 | v003 | respondent's line number | dbl | 0 | 22 | 1 - 26 |
Bangladesh | 2011 | 3 | v003 | respondent's line number | dbl | 0 | 21 | 1 - 23 |
Bangladesh | 2014 | 3 | v003 | respondent's line number | dbl | 0 | 19 | 1 - 22 |
Bangladesh | 2017 | 3 | v003 | respondent's line number | dbl | 0 | 21 | 1 - 27 |
Bangladesh | 2022 | 3 | v003 | respondent's line number | dbl | 0 | 19 | 1 - 23 |
Bangladesh | 1993 | 4 | bord | birth order number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 1996 | 4 | bord | birth order number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 1999 | 4 | bord | birth order number | dbl | 0 | 16 | 1 - 16 |
Bangladesh | 2004 | 4 | bord | birth order number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 2007 | 4 | bord | birth order number | dbl | 0 | 14 | 1 - 14 |
Bangladesh | 2011 | 4 | bord | birth order number | dbl | 0 | 20 | 1 - 20 |
Bangladesh | 2014 | 4 | bord | birth order number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 2017 | 4 | bord | birth order number | dbl | 0 | 13 | 1 - 13 |
Bangladesh | 2022 | 4 | bord | birth order number | dbl | 0 | 11 | 1 - 11 |
Bangladesh | 1993 | 5 | v021 | primary sampling unit | dbl | 0 | 301 | 101 - 573 |
Bangladesh | 1996 | 5 | v021 | primary sampling unit | dbl | 0 | 313 | 101 - 630 |
Bangladesh | 1999 | 5 | v021 | primary sampling unit | dbl | 0 | 341 | 3 - 500 |
Bangladesh | 2004 | 5 | v021 | primary sampling unit | dbl | 0 | 361 | 1 - 550 |
Bangladesh | 2007 | 5 | v021 | primary sampling unit | dbl | 0 | 361 | 1 - 361 |
Bangladesh | 2011 | 5 | v021 | primary sampling unit | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2014 | 5 | v021 | primary sampling unit | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2017 | 5 | v021 | primary sampling unit | dbl | 0 | 672 | 1 - 675 |
Bangladesh | 2022 | 5 | v021 | primary sampling unit | dbl | 0 | 674 | 1 - 675 |
Bangladesh | 1993 | 6 | v022 | sample stratum number | dbl | 0 | 148 | 1 - 148 |
Bangladesh | 1996 | 6 | v022 | sample stratum number | dbl | 0 | 152 | 1 - 154 |
Bangladesh | 1999 | 6 | v022 | sample stratum number | dbl | 0 | 168 | 1 - 168 |
Bangladesh | 2004 | 6 | v022 | sample stratum number | dbl | 0 | 177 | 1 - 177 |
Bangladesh | 2007 | 6 | v022 | sample stratum number | dbl | 0 | 179 | 1 - 179 |
Bangladesh | 2011 | 6 | v022 | sample strata for sampling errors | dbl | 0 | 20 | 1 - 20 |
Bangladesh | 2014 | 6 | v022 | sample strata for sampling errors | dbl+lbl | 0 | 20 | 1 - 21 |
Bangladesh | 2017 | 6 | v022 | sample strata for sampling errors | dbl+lbl | 0 | 22 | 1 - 22 |
Bangladesh | 2022 | 6 | v022 | sample strata for sampling errors | dbl+lbl | 0 | 16 | 1 - 16 |
Bangladesh | 1993 | 7 | v023 | sample domain | dbl+lbl | 0 | 14 | 2 - 15 |
Bangladesh | 1996 | 7 | v023 | sample domain | dbl+lbl | 0 | 16 | 1 - 16 |
Bangladesh | 1999 | 7 | v023 | sample domain | dbl+lbl | 0 | 12 | 1 - 12 |
Bangladesh | 2004 | 7 | v023 | sample domain | dbl+lbl | 0 | 1 | 0 - 0 |
Bangladesh | 2007 | 7 | v023 | sample domain | dbl | 0 | 22 | 1 - 22 |
Bangladesh | 2011 | 7 | v023 | stratification used in sample design | dbl+lbl | 0 | 20 | 1 - 20 |
Bangladesh | 2014 | 7 | v023 | stratification used in sample design | dbl+lbl | 0 | 21 | 1 - 21 |
Bangladesh | 2017 | 7 | v023 | stratification used in sample design | dbl+lbl | 0 | 22 | 1 - 22 |
Bangladesh | 2022 | 7 | v023 | stratification used in sample design | dbl+lbl | 0 | 16 | 1 - 16 |
Bangladesh | 1993 | 8 | v024 | region | dbl+lbl | 0 | 5 | 1 - 5 |
Bangladesh | 1996 | 8 | v024 | division | dbl+lbl | 0 | 6 | 1 - 6 |
Bangladesh | 1999 | 8 | v024 | division | dbl+lbl | 0 | 6 | 1 - 6 |
Bangladesh | 2004 | 8 | v024 | region | dbl+lbl | 0 | 6 | 1 - 6 |
Bangladesh | 2007 | 8 | v024 | division | dbl+lbl | 0 | 6 | 1 - 6 |
Bangladesh | 2011 | 8 | v024 | region | dbl+lbl | 0 | 7 | 1 - 7 |
Bangladesh | 2014 | 8 | v024 | division | dbl+lbl | 0 | 7 | 1 - 7 |
Bangladesh | 2017 | 8 | v024 | division | dbl+lbl | 0 | 8 | 1 - 8 |
Bangladesh | 2022 | 8 | v024 | division | dbl+lbl | 0 | 8 | 1 - 8 |
From the above we can see that v023 and v024 are of labelled class in most rounds, while the rest are mostly of numeric class (v022 is stored as labelled from 2014 onwards, and v023 is plain numeric in 2007). Therefore, we will have to check the numeric and labelled variables in different ways. Note that although survey year is a constituent ID variable, we have not checked it, since survey year is by construction a 4-digit value.
Numeric ID variables check
First, let’s find out the required length of the numeric ID variables by checking the maximum values of the constituent ID variable across the Bangladesh DHS datasets. Here we estimate the summary stats of numeric constituent variables using skim_without_charts().
# Check the summary stats for ID vars using skimr in each bdbr dataset.
# First we estimate the summary stats using skim_without_charts().
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(
    skim_id_num = map(
      bdbr_data,
      \(df) {
        df |>
          select(v001, v002, v003, bord, v021, v022) |>
          skim_without_charts() |>
          as_tibble() |>
          select(-c(skim_type, n_missing, complete_rate)) |>
          # Rename the remaining columns by position to short stat names
          rename(
            variable = 1,
            mean = 2,
            sd = 3,
            min = 4,
            p25 = 5,
            p50 = 6,
            p75 = 7,
            max = 8
          )
      }
    )
  )
bdbr1_pre_tmp1
Next, we check the summary stats of the numeric variables, variable by variable.
# Now we unnest the nested tibble so that we can compare the variable length
# across the bdbr datasets.
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(skim_id_num)) |>
  arrange(variable, svy_year) |>
  # change the decimal places of selected variables
  # (sprintf() turns these columns into character; min and max stay numeric)
  mutate(
    mean = sprintf("%.1f", mean),
    sd = sprintf("%.1f", sd),
    p75 = sprintf("%.0f", p75)
  )
# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | variable | mean | sd | min | p25 | p50 | p75 | max |
---|---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | bord | 3.2 | 2.2 | 1 | 1 | 3 | 4 | 15 |
Bangladesh | 1996 | bord | 3.1 | 2.1 | 1 | 1 | 3 | 4 | 15 |
Bangladesh | 1999 | bord | 2.9 | 2.0 | 1 | 1 | 2 | 4 | 16 |
Bangladesh | 2004 | bord | 2.8 | 1.9 | 1 | 1 | 2 | 4 | 15 |
Bangladesh | 2007 | bord | 2.7 | 1.8 | 1 | 1 | 2 | 4 | 14 |
Bangladesh | 2011 | bord | 2.5 | 1.6 | 1 | 1 | 2 | 3 | 20 |
Bangladesh | 2014 | bord | 2.4 | 1.5 | 1 | 1 | 2 | 3 | 15 |
Bangladesh | 2017 | bord | 2.2 | 1.4 | 1 | 1 | 2 | 3 | 13 |
Bangladesh | 2022 | bord | 2.0 | 1.2 | 1 | 1 | 2 | 3 | 11 |
Bangladesh | 1993 | v001 | 356.6 | 137.1 | 101 | 247 | 361 | 501 | 573 |
Bangladesh | 1996 | v001 | 392.2 | 151.6 | 101 | 266 | 384 | 533 | 630 |
Bangladesh | 1999 | v001 | 264.0 | 141.3 | 3 | 149 | 270 | 392 | 500 |
Bangladesh | 2004 | v001 | 212.1 | 160.2 | 1 | 81 | 175 | 325 | 550 |
Bangladesh | 2007 | v001 | 181.8 | 107.6 | 1 | 87 | 177 | 279 | 361 |
Bangladesh | 2011 | v001 | 302.7 | 178.0 | 1 | 143 | 299 | 461 | 600 |
Bangladesh | 2014 | v001 | 301.1 | 180.8 | 1 | 136 | 299 | 466 | 600 |
Bangladesh | 2017 | v001 | 338.1 | 202.7 | 1 | 152 | 338 | 522 | 675 |
Bangladesh | 2022 | v001 | 338.9 | 200.1 | 1 | 158 | 338 | 517 | 675 |
Bangladesh | 1993 | v002 | 145.5 | 115.7 | 1 | 53 | 117 | 214 | 615 |
Bangladesh | 1996 | v002 | 149.4 | 118.3 | 1 | 54 | 119 | 221 | 638 |
Bangladesh | 1999 | v002 | 154.3 | 110.3 | 1 | 61 | 135 | 229 | 544 |
Bangladesh | 2004 | v002 | 54.4 | 39.6 | 1 | 23 | 48 | 78 | 280 |
Bangladesh | 2007 | v002 | 53.7 | 36.4 | 1 | 24 | 49 | 78 | 252 |
Bangladesh | 2011 | v002 | 58.8 | 35.9 | 1 | 28 | 57 | 87 | 217 |
Bangladesh | 2014 | v002 | 58.3 | 38.0 | 1 | 27 | 54 | 84 | 222 |
Bangladesh | 2017 | v002 | 64.4 | 42.1 | 1 | 30 | 60 | 93 | 299 |
Bangladesh | 2022 | v002 | 59.7 | 38.2 | 1 | 28 | 56 | 86 | 220 |
Bangladesh | 1993 | v003 | 2.4 | 1.6 | 1 | 2 | 2 | 2 | 26 |
Bangladesh | 1996 | v003 | 2.4 | 1.6 | 1 | 2 | 2 | 2 | 28 |
Bangladesh | 1999 | v003 | 2.5 | 1.7 | 1 | 2 | 2 | 2 | 19 |
Bangladesh | 2004 | v003 | 2.4 | 1.7 | 1 | 2 | 2 | 2 | 30 |
Bangladesh | 2007 | v003 | 2.4 | 1.6 | 1 | 2 | 2 | 2 | 26 |
Bangladesh | 2011 | v003 | 2.4 | 1.5 | 1 | 2 | 2 | 2 | 23 |
Bangladesh | 2014 | v003 | 2.3 | 1.4 | 1 | 2 | 2 | 2 | 22 |
Bangladesh | 2017 | v003 | 2.3 | 1.5 | 1 | 2 | 2 | 2 | 27 |
Bangladesh | 2022 | v003 | 2.3 | 1.3 | 1 | 2 | 2 | 2 | 23 |
Bangladesh | 1993 | v021 | 356.6 | 137.1 | 101 | 247 | 361 | 501 | 573 |
Bangladesh | 1996 | v021 | 392.2 | 151.6 | 101 | 266 | 384 | 533 | 630 |
Bangladesh | 1999 | v021 | 264.0 | 141.3 | 3 | 149 | 270 | 392 | 500 |
Bangladesh | 2004 | v021 | 212.1 | 160.2 | 1 | 81 | 175 | 325 | 550 |
Bangladesh | 2007 | v021 | 181.8 | 107.6 | 1 | 87 | 177 | 279 | 361 |
Bangladesh | 2011 | v021 | 302.7 | 178.0 | 1 | 143 | 299 | 461 | 600 |
Bangladesh | 2014 | v021 | 301.1 | 180.8 | 1 | 136 | 299 | 466 | 600 |
Bangladesh | 2017 | v021 | 338.1 | 202.7 | 1 | 152 | 338 | 522 | 675 |
Bangladesh | 2022 | v021 | 338.9 | 200.1 | 1 | 158 | 338 | 517 | 675 |
Bangladesh | 1993 | v022 | 76.1 | 43.8 | 1 | 37 | 78 | 113 | 148 |
Bangladesh | 1996 | v022 | 80.1 | 45.3 | 1 | 38 | 81 | 120 | 154 |
Bangladesh | 1999 | v022 | 87.6 | 49.1 | 1 | 44 | 90 | 131 | 168 |
Bangladesh | 2004 | v022 | 86.3 | 51.7 | 1 | 40 | 87 | 131 | 177 |
Bangladesh | 2007 | v022 | 90.6 | 53.6 | 1 | 43 | 88 | 139 | 179 |
Bangladesh | 2011 | v022 | 14.1 | 4.9 | 1 | 11 | 15 | 18 | 20 |
Bangladesh | 2014 | v022 | 15.1 | 5.2 | 1 | 12 | 16 | 19 | 21 |
Bangladesh | 2017 | v022 | 12.2 | 6.2 | 1 | 6 | 12 | 18 | 22 |
Bangladesh | 2022 | v022 | 8.5 | 4.6 | 1 | 4 | 8 | 12 | 16 |
Now we find out the required length of each numeric ID variable, so that we can correctly concatenate them to create the ID variables. The required lengths of the numeric ID variables are given in the max_digits column. Note that survey year is also a constituent ID variable, of 4 digits.
# Processing the above nested tibble further:
# one row per ID variable with its overall min, max and digit count.
bdbr1_pre_tmp3 <- bdbr1_pre_tmp2 |>
  group_by(variable) |>
  # find the minimum and maximum values across surveys
  summarize(
    min_val = min(min),
    max_val = max(max)
  ) |>
  mutate(
    # calculate the num of digits in the maximum values
    max_digits = nchar(as.character(max_val)),
    # convert char var to factor
    variable = fct(
      variable,
      levels = c("v001", "v002", "v003", "bord", "v021", "v022")
    )
  ) |>
  # sort the rows by factor levels
  arrange(variable) |>
  # add variable labels and relocate it after variable name.
  # NOTE: labels are matched by row position, so they must stay in the same
  # order as the factor levels above.
  bind_cols(vlabel = c("cluster number", "household number",
                       "respondent's line number", "birth order",
                       "primary sampling unit", "sample strata for se")) |>
  relocate(vlabel, .after = 1)
# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp3 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
variable | vlabel | min_val | max_val | max_digits |
---|---|---|---|---|
v001 | cluster number | 1 | 675 | 3 |
v002 | household number | 1 | 638 | 3 |
v003 | respondent's line number | 1 | 30 | 2 |
bord | birth order | 1 | 20 | 2 |
v021 | primary sampling unit | 1 | 675 | 3 |
v022 | sample strata for se | 1 | 179 | 3 |
Labelled ID variables check
First we check the labels in sub-national region variable coded as v024 across the bdbr datasets. Let’s create a nested tibble of v024’s value labels.
# Create the data dictionary for v024 in nested tibble:
# per survey, keep only the value labels, one label per row.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v024 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v024) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1
Now we view the value labels of v024 in the table below.
# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v024)) |>
  # Next we make the num of value labels same across each round:
  # the numeric code is parsed from the label text and missing
  # survey-by-code combinations are added as empty rows
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v024", .before = 2)
# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v024 | 1 | [1] barishal | [1] barisal | [1] barisal | [1] barisal | [1] barisal | [1] barisal | [1] barisal | [1] barisal | [1] barishal |
Bangladesh | v024 | 2 | [2] chittagong | [2] chittagong | [2] chittagong | [2] chittagong | [2] chittagong | [2] chittagong | [2] chittagong | [2] chittagong | [2] chattogram |
Bangladesh | v024 | 3 | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka | [3] dhaka |
Bangladesh | v024 | 4 | [4] khulna | [4] khulna | [4] khulna | [4] khulna | [4] khulna | [4] khulna | [4] khulna | [4] khulna | [4] khulna |
Bangladesh | v024 | 5 | [5] rajshani | [5] rajashahi | [5] rajashahi | [5] rajshahi | [5] rajshahi | [5] rajshahi | [5] rajshahi | [5] mymensingh | [5] mymensingh |
Bangladesh | v024 | 6 | [6] sylhet | [6] sylhet | [6] sylhet | [6] sylhet | [6] rangpur | [6] rangpur | [6] rajshahi | [6] rajshahi | |
Bangladesh | v024 | 7 | [7] sylhet | [7] sylhet | [7] rangpur | [7] rangpur | |||||
Bangladesh | v024 | 8 | [8] sylhet | [8] sylhet |
NOTE: The sub-national region variable, v024, denotes the same variable concept across all bdbr rounds. The number of value labels increases across the survey years, reflecting the creation of new sub-national regions. In 1993 there were 5 regions, which increased to 6 for bdbr 1996, 1999, 2004 and 2007, then to 7 for bdbr 2011 and 2014; finally, there were 8 regions in 2017 and 2022.
VERD: In this analysis, we do not use the region var in the ID var.
Secondly, we check the labels in v023 variable that denotes the stratifications used for sampling design. First we create a nested tibble of v023’s value labels.
# Create the data dictionary for v023 in nested tibble:
# per survey, keep only the value labels, one label per row.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v023 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v023) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1
Now we view the value labels of v023 in the table below.
# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v023)) |>
  # Next we make the num of value labels same across each round:
  # the numeric code is parsed from the label text and missing
  # survey-by-code combinations are added as empty rows
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v023", .before = 2)
# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v023 | 0 | [0] national | [0] national | |||||||
Bangladesh | v023 | 1 | [1] country specific | [1] barisal - municip. | [1] barisal : urban | [1] country specific | [1] barisal city corp. | [1] barisal city corp. | [1] barisal - city corporation | [1] barishal urban | |
Bangladesh | v023 | 2 | [2] barisal - rural | [2] barisal : rural | [2] chittagong city corp. | [2] chittagong city corp. | [2] barisal - other urban | [2] barishal rural | |||
Bangladesh | v023 | 3 | [3] chittagong - sma | [3] chittagong : urban | [3] dhaka city corp. | [3] dhaka city corp. | [3] barisal - rural | [3] chattogram urban | |||
Bangladesh | v023 | 4 | [4] chittagong - municip | [4] chittagong : rural | [4] khulna city corp. | [4] khulna city corp. | [4] chittagong - city corporation | [4] chattogram rural | |||
Bangladesh | v023 | 5 | [5] chittagong - rural | [5] dhaka : urban | [5] rajshahi city corp. | [5] rajshahi city corp. | [5] chittagong - other urban | [5] dhaka urban | |||
Bangladesh | v023 | 6 | [6] dhaka - sma | [6] dhaka : rural | [6] sylhet city corp. | [6] rangpur city corp. | [6] chittagong - rural | [6] dhaka rural | |||
Bangladesh | v023 | 7 | [7] dhaka - municip. | [7] khulna : urban | [7] barisal other urban | [7] sylhet city corp. | [7] dhaka - city corporation | [7] khulna urban | |||
Bangladesh | v023 | 8 | [8] dhaka - rural | [8] khulna : rural | [8] chittagong other urban | [8] barisal other urban | [8] dhaka - other urban | [8] khulna rural | |||
Bangladesh | v023 | 9 | [9] khulna - sma | [9] rajashahi : urban | [9] dhaka other urban | [9] chittagong other urban | [9] dhaka - rural | [9] mymensingh urban | |||
Bangladesh | v023 | 10 | [10] khulna - municip. | [10] rajashahi : rural | [10] khulna other urban | [10] dhaka other urban | [10] khulna - city corporation | [10] mymensingh rural | |||
Bangladesh | v023 | 11 | [11] khulna - rural | [11] sylhet : urban | [11] rajshahi other urban | [11] khulna other urban | [11] khulna - other urban | [11] rajshahi urban | |||
Bangladesh | v023 | 12 | [12] rajshahi - sma | [12] sylhet : rural | [12] rangpur other urban | [12] rajshahi other urban | [12] khulna - rural | [12] rajshahi rural | |||
Bangladesh | v023 | 13 | [13] rajshahi - municip. | [13] sylhet other urban | [13] rangpur other urban | [13] mymensingh - other urban | [13] rangpur urban | ||||
Bangladesh | v023 | 14 | [14] rajshahi - rural | [14] barisal rural | [14] sylhet other urban | [14] mymensingh - rural | [14] rangpur rural | ||||
Bangladesh | v023 | 15 | [15] sylhet - municip. | [15] chittagong rural | [15] barisal rural | [15] rajshahi - city corporation | [15] sylhet urban | ||||
Bangladesh | v023 | 16 | [16] sylhet - rural | [16] dhaka rural | [16] chittagong rural | [16] rajshahi - other urban | [16] sylhet rural | ||||
Bangladesh | v023 | 17 | [17] khulna rural | [17] dhaka rural | [17] rajshahi - rural | ||||||
Bangladesh | v023 | 18 | [18] rajshahi rural | [18] khulna rural | [18] rangpur - other urban | ||||||
Bangladesh | v023 | 19 | [19] rangpur rural | [19] rajshahi rural | [19] rangpur - rural | ||||||
Bangladesh | v023 | 20 | [20] sylhet rural | [20] rangpur rural | [20] sylhet - city corporation | ||||||
Bangladesh | v023 | 21 | [21] sylhet rural | [21] sylhet - other urban | |||||||
Bangladesh | v023 | 22 | [22] sylhet - rural | ||||||||
Bangladesh | v023 |
NOTE: The labels of v023 are different across the survey rounds.
VERD: Therefore we cannot use v023 in the ID variable preparation.
Checking the Birth History variables before harmonization
Undoubtedly the birth history variables are important for this study objective. Therefore, we need to scrutinize all the birth history variables before using them to prepare harmonized variables for the pooled dataset.
# We check the birth history vars in all bdbr datasets.
# First we create a data dictionary in nested tibble:
# `bidx` plus every column whose name starts with "b" followed by digits.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_bhvars = map(
    bdbr_data,
    \(df) {
      df |>
        select(bidx, matches("^b[0-9]+")) |>
        look_for(details = "full") |>
        # Drop the detail columns from `levels` through `n_na`
        select(-c(levels:n_na)) |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character()
    }
  ))
bdbr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary,
# sorted by variable position so each variable's rounds sit together.
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_bhvars)) |>
  arrange(pos)
# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | 1 | bidx | birth column number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 1996 | 1 | bidx | birth column number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 1999 | 1 | bidx | birth column number | dbl | 0 | 16 | 1 - 16 |
Bangladesh | 2004 | 1 | bidx | birth column number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 2007 | 1 | bidx | birth column number | dbl | 0 | 14 | 1 - 14 |
Bangladesh | 2011 | 1 | bidx | birth column number | dbl | 0 | 20 | 1 - 20 |
Bangladesh | 2014 | 1 | bidx | birth column number | dbl | 0 | 15 | 1 - 15 |
Bangladesh | 2017 | 1 | bidx | birth column number | dbl | 0 | 13 | 1 - 13 |
Bangladesh | 2022 | 1 | bidx | birth column number | dbl | 0 | 11 | 1 - 11 |
Bangladesh | 1993 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 1996 | 2 | b0 | child is twin | dbl+lbl | 0 | 3 | 0 - 2 |
Bangladesh | 1999 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2004 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2007 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2011 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2014 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2017 | 2 | b0 | child is twin | dbl+lbl | 0 | 5 | 0 - 4 |
Bangladesh | 2022 | 2 | b0 | child is twin | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 1993 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 1996 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 1999 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2004 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2007 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2011 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2014 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2017 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 2022 | 3 | b1 | month of birth | dbl | 0 | 12 | 1 - 12 |
Bangladesh | 1993 | 4 | b2 | year of birth | dbl | 0 | 38 | 57 - 94 |
Bangladesh | 1996 | 4 | b2 | year of birth | dbl | 0 | 38 | 60 - 97 |
Bangladesh | 1999 | 4 | b2 | year of birth | dbl+lbl | 0 | 38 | 0 - 99 |
Bangladesh | 2004 | 4 | b2 | year of birth | dbl | 0 | 39 | 1966 - 2004 |
Bangladesh | 2007 | 4 | b2 | year of birth | dbl | 0 | 38 | 1970 - 2007 |
Bangladesh | 2011 | 4 | b2 | year of birth | dbl | 0 | 38 | 1974 - 2011 |
Bangladesh | 2014 | 4 | b2 | year of birth | dbl | 0 | 37 | 1978 - 2014 |
Bangladesh | 2017 | 4 | b2 | year of birth | dbl | 0 | 38 | 1981 - 2018 |
Bangladesh | 2022 | 4 | b2 | year of birth | dbl | 0 | 39 | 1984 - 2022 |
Bangladesh | 1993 | 5 | b3 | date of birth (cmc) | dbl | 0 | 430 | 694 - 1131 |
Bangladesh | 1996 | 5 | b3 | date of birth (cmc) | dbl | 0 | 429 | 729 - 1166 |
Bangladesh | 1999 | 5 | b3 | date of birth (cmc) | dbl | 0 | 434 | 763 - 1203 |
Bangladesh | 2004 | 5 | b3 | date of birth (cmc) | dbl | 0 | 437 | 797 - 1253 |
Bangladesh | 2007 | 5 | b3 | date of birth (cmc) | dbl | 0 | 436 | 850 - 1291 |
Bangladesh | 2011 | 5 | b3 | date of birth (cmc) | dbl | 0 | 443 | 893 - 1344 |
Bangladesh | 2014 | 5 | b3 | date of birth (cmc) | dbl | 0 | 438 | 938 - 1378 |
Bangladesh | 2017 | 5 | b3 | date of birth (cmc) | dbl | 0 | 436 | 984 - 1419 |
Bangladesh | 2022 | 5 | b3 | date of birth (cmc) | dbl | 0 | 447 | 1010 - 1476 |
Bangladesh | 1993 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1996 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1999 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2004 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2007 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2011 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2014 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2017 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2022 | 6 | b4 | sex of child | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1993 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 1996 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 1999 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2004 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2007 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2011 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2014 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2017 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 2022 | 7 | b5 | child is alive | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 1993 | 8 | b6 | age at death | dbl+lbl | 26546 | 93 | 100 - 399 |
Bangladesh | 1996 | 8 | b6 | age at death | dbl+lbl | 24383 | 91 | 100 - 399 |
Bangladesh | 1999 | 8 | b6 | age at death | dbl+lbl | 26984 | 89 | 100 - 333 |
Bangladesh | 2004 | 8 | b6 | age at death | dbl+lbl | 28912 | 87 | 100 - 331 |
Bangladesh | 2007 | 8 | b6 | age at death | dbl+lbl | 26730 | 89 | 100 - 999 |
Bangladesh | 2011 | 8 | b6 | age at death | dbl+lbl | 41057 | 87 | 100 - 999 |
Bangladesh | 2014 | 8 | b6 | age at death | dbl+lbl | 39772 | 86 | 100 - 330 |
Bangladesh | 2017 | 8 | b6 | age at death | dbl+lbl | 43840 | 86 | 100 - 334 |
Bangladesh | 2022 | 8 | b6 | age at death | dbl+lbl | 60613 | 85 | 100 - 332 |
Bangladesh | 1993 | 9 | b7 | age at death (months-imputed) | dbl | 26538 | 56 | 0 - 348 |
Bangladesh | 1996 | 9 | b7 | age at death (months-imputed) | dbl | 24380 | 56 | 0 - 372 |
Bangladesh | 1999 | 9 | b7 | age at death (months-imputed) | dbl | 26972 | 55 | 0 - 396 |
Bangladesh | 2004 | 9 | b7 | age at death (months-imputed) | dbl | 28911 | 55 | 0 - 372 |
Bangladesh | 2007 | 9 | b7 | age at death (months-imputed) | dbl | 26730 | 58 | 0 - 384 |
Bangladesh | 2011 | 9 | b7 | age at death (months, imputed) | dbl | 41057 | 56 | 0 - 384 |
Bangladesh | 2014 | 9 | b7 | age at death (months, imputed) | dbl | 39770 | 55 | 0 - 360 |
Bangladesh | 2017 | 9 | b7 | age at death (months, imputed) | dbl | 43840 | 56 | 0 - 408 |
Bangladesh | 2022 | 9 | b7 | age at death (months, imputed) | dbl | 60613 | 55 | 0 - 384 |
Bangladesh | 1993 | 10 | b8 | current age of child | dbl | 6043 | 38 | 0 - 36 |
Bangladesh | 1996 | 10 | b8 | current age of child | dbl | 4964 | 38 | 0 - 36 |
Bangladesh | 1999 | 10 | b8 | current age of child | dbl | 4934 | 38 | 0 - 36 |
Bangladesh | 2004 | 10 | b8 | current age of child | dbl | 4686 | 38 | 0 - 36 |
Bangladesh | 2007 | 10 | b8 | current age of child | dbl | 3797 | 38 | 0 - 36 |
Bangladesh | 2011 | 10 | b8 | current age of child | dbl | 4776 | 39 | 0 - 37 |
Bangladesh | 2014 | 10 | b8 | current age of child | dbl | 4002 | 38 | 0 - 36 |
Bangladesh | 2017 | 10 | b8 | current age of child | dbl | 3988 | 37 | 0 - 35 |
Bangladesh | 2022 | 10 | b8 | current age of child | dbl | 4109 | 40 | 0 - 38 |
Bangladesh | 1993 | 11 | b9 | who child lives with | dbl+lbl | 6043 | 3 | 0 - 4 |
Bangladesh | 1996 | 11 | b9 | who child lives with | dbl+lbl | 4964 | 3 | 0 - 4 |
Bangladesh | 1999 | 11 | b9 | who child lives with | dbl+lbl | 4934 | 3 | 0 - 4 |
Bangladesh | 2004 | 11 | b9 | child lives with whom | dbl+lbl | 4686 | 3 | 0 - 4 |
Bangladesh | 2007 | 11 | b9 | child lives with whom | dbl+lbl | 3797 | 3 | 0 - 4 |
Bangladesh | 2011 | 11 | b9 | child lives with whom | dbl+lbl | 4776 | 3 | 0 - 4 |
Bangladesh | 2014 | 11 | b9 | child lives with whom | dbl+lbl | 4002 | 3 | 0 - 4 |
Bangladesh | 2017 | 11 | b9 | child lives with whom | dbl+lbl | 3988 | 3 | 0 - 4 |
Bangladesh | 2022 | 11 | b9 | child lives with whom | dbl+lbl | 4109 | 3 | 0 - 4 |
Bangladesh | 1993 | 12 | b10 | completeness of information | dbl+lbl | 0 | 8 | 1 - 8 |
Bangladesh | 1996 | 12 | b10 | completeness of information | dbl+lbl | 0 | 6 | 1 - 8 |
Bangladesh | 1999 | 12 | b10 | completeness of information | dbl+lbl | 0 | 7 | 1 - 8 |
Bangladesh | 2004 | 12 | b10 | completeness of information | dbl+lbl | 0 | 3 | 1 - 5 |
Bangladesh | 2007 | 12 | b10 | completeness of information | dbl+lbl | 0 | 3 | 1 - 5 |
Bangladesh | 2011 | 12 | b10 | completeness of information | dbl+lbl | 0 | 6 | 1 - 8 |
Bangladesh | 2014 | 12 | b10 | completeness of information | dbl+lbl | 0 | 6 | 1 - 8 |
Bangladesh | 2017 | 12 | b10 | completeness of information | dbl+lbl | 0 | 4 | 0 - 5 |
Bangladesh | 2022 | 12 | b10 | completeness of information | dbl+lbl | 0 | 5 | 0 - 6 |
Bangladesh | 1993 | 13 | b11 | preceding birth interval | dbl | 8577 | 168 | 7 - 270 |
Bangladesh | 1996 | 13 | b11 | preceding birth interval | dbl | 8109 | 164 | 6 - 300 |
Bangladesh | 1999 | 13 | b11 | preceding birth interval | dbl | 9405 | 177 | 9 - 246 |
Bangladesh | 2004 | 13 | b11 | preceding birth interval | dbl | 10189 | 179 | 8 - 231 |
Bangladesh | 2007 | 13 | b11 | preceding birth interval | dbl | 9893 | 196 | 8 - 318 |
Bangladesh | 2011 | 13 | b11 | preceding birth interval (months) | dbl | 16107 | 207 | -3 - 246 |
Bangladesh | 2014 | 13 | b11 | preceding birth interval (months) | dbl | 16181 | 205 | 9 - 267 |
Bangladesh | 2017 | 13 | b11 | preceding birth interval (months) | dbl | 18241 | 215 | 7 - 291 |
Bangladesh | 2022 | 13 | b11 | preceding birth interval (months) | dbl | 27116 | 235 | 0 - 343 |
Bangladesh | 1993 | 14 | b12 | succeeding birth interval | dbl | 8614 | 168 | 7 - 270 |
Bangladesh | 1996 | 14 | b12 | succeeding birth interval | dbl | 8156 | 164 | 6 - 300 |
Bangladesh | 1999 | 14 | b12 | succeeding birth interval | dbl | 9442 | 177 | 9 - 246 |
Bangladesh | 2004 | 14 | b12 | succeeding birth interval | dbl | 10246 | 179 | 8 - 231 |
Bangladesh | 2007 | 14 | b12 | succeeding birth interval | dbl | 9947 | 196 | 8 - 318 |
Bangladesh | 2011 | 14 | b12 | succeeding birth interval (months) | dbl | 16183 | 207 | -3 - 246 |
Bangladesh | 2014 | 14 | b12 | succeeding birth interval (months) | dbl | 16225 | 205 | 9 - 267 |
Bangladesh | 2017 | 14 | b12 | succeeding birth interval (months) | dbl | 18329 | 215 | 7 - 291 |
Bangladesh | 2022 | 14 | b12 | succeeding birth interval (months) | dbl | 27214 | 235 | 0 - 343 |
Bangladesh | 1993 | 15 | b13 | flag for age at death | dbl+lbl | 26538 | 5 | 0 - 8 |
Bangladesh | 1996 | 15 | b13 | flag for age at death | dbl+lbl | 24380 | 6 | 0 - 8 |
Bangladesh | 1999 | 15 | b13 | flag for age at death | dbl+lbl | 26972 | 7 | 0 - 8 |
Bangladesh | 2004 | 15 | b13 | flag for age at death | dbl+lbl | 28911 | 6 | 0 - 8 |
Bangladesh | 2007 | 15 | b13 | flag for age at death | dbl+lbl | 26730 | 6 | 0 - 8 |
Bangladesh | 2011 | 15 | b13 | flag for age at death | dbl+lbl | 41057 | 5 | 0 - 8 |
Bangladesh | 2014 | 15 | b13 | flag for age at death | dbl+lbl | 39770 | 4 | 0 - 8 |
Bangladesh | 2017 | 15 | b13 | flag for age at death | dbl+lbl | 43840 | 3 | 0 - 6 |
Bangladesh | 2022 | 15 | b13 | flag for age at death | dbl+lbl | 60613 | 2 | 0 - 0 |
Bangladesh | 1993 | 16 | b14 | birth interval >= 4 years -na | dbl+lbl | 32581 | 1 | |
Bangladesh | 1996 | 16 | b14 | birth interval >= 4 years | dbl+lbl | 8088 | 3 | 0 - 1 |
Bangladesh | 1999 | 16 | b14 | birth interval >= 4 years -na | dbl | 31906 | 1 | |
Bangladesh | 2004 | 16 | b15 | live birth between births | dbl+lbl | 10173 | 3 | 0 - 1 |
Bangladesh | 2007 | 16 | b15 | live birth between births | dbl+lbl | 9849 | 4 | 0 - 9 |
Bangladesh | 2011 | 16 | b15 | live birth between births | dbl+lbl | 16014 | 4 | 0 - 9 |
Bangladesh | 2014 | 16 | b15 | live birth between births | dbl+lbl | 16087 | 3 | 0 - 1 |
Bangladesh | 2017 | 16 | b15 | live birth between births | dbl+lbl | 18142 | 3 | 0 - 1 |
Bangladesh | 2022 | 16 | b15 | live birth between births | dbl+lbl | 0 | 2 | 0 - 1 |
Bangladesh | 1993 | 17 | b15 | live birth between births -na | dbl+lbl | 32581 | 1 | |
Bangladesh | 1996 | 17 | b15 | live birth between births | dbl+lbl | 25432 | 3 | 0 - 1 |
Bangladesh | 1999 | 17 | b15 | live birth between births | dbl+lbl | 9366 | 2 | 0 - 0 |
Bangladesh | 2004 | 17 | b16 | child's line number in household | dbl+lbl | 4686 | 29 | 0 - 31 |
Bangladesh | 2007 | 17 | b16 | child's line number in household | dbl+lbl | 3797 | 29 | 0 - 28 |
Bangladesh | 2011 | 17 | b16 | child's line number in household | dbl+lbl | 4776 | 31 | 0 - 29 |
Bangladesh | 2014 | 17 | b16 | child's line number in household | dbl+lbl | 4002 | 24 | 0 - 24 |
Bangladesh | 2017 | 17 | b16 | child's line number in household | dbl+lbl | 3988 | 26 | 0 - 29 |
Bangladesh | 2022 | 17 | b16 | child's line number in household | dbl+lbl | 4109 | 25 | 0 - 25 |
Bangladesh | 2017 | 18 | b17 | day of birth | dbl | 0 | 31 | 1 - 31 |
Bangladesh | 2022 | 18 | b17 | day of birth | dbl | 0 | 31 | 1 - 31 |
Bangladesh | 2017 | 19 | b18 | century day code of birth (cdc) | dbl | 0 | 11428 | 29926 - 43167 |
Bangladesh | 2022 | 19 | b18 | century day code of birth (cdc) | dbl | 0 | 11617 | 30713 - 44899 |
Bangladesh | 2017 | 20 | b19 | current age of child in months (months since birth for dead children) | dbl | 0 | 432 | 0 - 431 |
Bangladesh | 2022 | 20 | b19 | current age of child in months (months since birth for dead children) | dbl | 0 | 445 | 0 - 462 |
Bangladesh | 2017 | 21 | b20 | na - duration of pregnancy | dbl | 47828 | 1 | |
Bangladesh | 2022 | 21 | b20 | duration of pregnancy in months | dbl | 0 | 6 | 5 - 10 |
Bangladesh | 2022 | 22 | b21 | duration of pregnancy | dbl | 0 | 22 | 124 - 210 |
From the above table we get an overall snapshot of the birth history variables. We see that the variables bidx and b0-b13 are common in all the 9 bdbr datasets. Notably, some variables are available only in certain rounds: b14 appears only in bdbr 1993, 1996 and 1999, b16 only from bdbr 2004 onwards, b17-b20 only in bdbr 2017 and 2022, and b21 only in bdbr 2022. Next, we look in more detail at the labelled variables that are common across the bdbr datasets. We would like to see if the value labels of the common birth history variables are similar across the bdbr datasets.
b0 - child is twin
We check the value labels of b0 variable that denotes whether the child is twin. First we create a nested tibble of b0’s value labels.
# Create the data dictionary for b0 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b0,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b0 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b0) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b0)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as
  # "[0] single birth"; complete() then adds a row for every
  # round x code combination that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b0", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b0 | 0 | [0] single birth | [0] single birth | [0] single birth | [0] single birth | [0] single birth | [0] single birth | [0] single birth | [0] single birth | [0] single birth |
Bangladesh | b0 | 1 | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple | [1] 1st of multiple |
Bangladesh | b0 | 2 | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple | [2] 2nd of multiple |
Bangladesh | b0 | 3 | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple | [3] 3rd of multiple |
Bangladesh | b0 | 4 | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple | [4] 4th of multiple |
Bangladesh | b0 | 5 | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple | [5] 5th of multiple |
We can see the value labels of b0 in the above table. We see that the value labels are same across all the bdbr datasets.
b2 - year of birth
We see that the b2 variable has value labels only for bdbr 1999. Therefore, we check the value labels of the variable during this round.
# Create the data dictionary for b2 in bdbr 1999.
# details = "full" requests the extended dictionary; the list columns are
# converted to character and the columns not needed for display are dropped
# before the result is rendered as a flextable.
bdbr1_pre_tmp0$bdbr_data$bdbr_1999 |>
  select(b2) |>
  look_for(details = "full") |>
  lookfor_to_long_format() |>
  convert_list_columns_to_character() |>
  select(-c(pos, levels, class:n_na)) |>
  qflextable() |>
  autofit()
variable | label | col_type | missing | value_labels | unique_values | range |
---|---|---|---|---|---|---|
b2 | year of birth | dbl+lbl | 0 | [0] year 2000 | 38 | 0 - 99 |
Note that the birth year value 0, which corresponds to the year 2000, carries the label "year 2000". That is the only value label in the variable.
# Check the distribution of b2 in bdbr 1999.
# tabyl() gives the frequency table; a totals row and percentage formatting
# are added before rendering it as a flextable.
bdbr1_pre_tmp0$bdbr_data$bdbr_1999 |>
  tabyl(b2) |>
  adorn_totals() |>
  adorn_pct_formatting() |>
  qflextable() |>
  autofit()
b2 | n | percent |
---|---|---|
0 | 85 | 0.3% |
63 | 2 | 0.0% |
64 | 16 | 0.1% |
65 | 48 | 0.2% |
66 | 69 | 0.2% |
67 | 125 | 0.4% |
68 | 147 | 0.5% |
69 | 245 | 0.8% |
70 | 295 | 0.9% |
71 | 426 | 1.3% |
72 | 371 | 1.2% |
73 | 485 | 1.5% |
74 | 584 | 1.8% |
75 | 608 | 1.9% |
76 | 628 | 2.0% |
77 | 744 | 2.3% |
78 | 707 | 2.2% |
79 | 893 | 2.8% |
80 | 846 | 2.7% |
81 | 999 | 3.1% |
82 | 975 | 3.1% |
83 | 1119 | 3.5% |
84 | 1150 | 3.6% |
85 | 1197 | 3.8% |
86 | 1210 | 3.8% |
87 | 1384 | 4.3% |
88 | 1360 | 4.3% |
89 | 1489 | 4.7% |
90 | 1340 | 4.2% |
91 | 1370 | 4.3% |
92 | 1488 | 4.7% |
93 | 1411 | 4.4% |
94 | 1255 | 3.9% |
95 | 1365 | 4.3% |
96 | 1319 | 4.1% |
97 | 1383 | 4.3% |
98 | 1440 | 4.5% |
99 | 1328 | 4.2% |
Total | 31906 | 100.0% |
For more clarity we checked the univariate distribution of birth year above in Table 11. It is clear that the birth years during the 1900s are represented by just their last two digits, and the year 2000 is represented by 0.
b4 - sex of child
We check the value labels of b4 variable which gives the sex of the child. First we create a nested tibble of b4’s value labels.
# Create the data dictionary for b4 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b4,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b4 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b4) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b4)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as "[1] male";
  # complete() then adds a row for every round x code combination
  # that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b4", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b4 | 1 | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male |
Bangladesh | b4 | 2 | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female |
We can see the value labels of b4 in the above table. The value labels are same across all the bdbr datasets.
b5 - child is alive
We check the value labels of b5 variable which gives the survival status of the child. First we create a nested tibble of b5’s value labels.
# Create the data dictionary for b5 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b5,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b5 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b5) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b5)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as "[0] no";
  # complete() then adds a row for every round x code combination
  # that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b5", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b5 | 0 | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no |
Bangladesh | b5 | 1 | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes |
The above table shows that the value labels of survival status of child are same across all the bdbr datasets.
b6 - age at death
We check the value labels of the b6 variable, which shows the age at death of children. Note that this variable has many missing values across all bdbr rounds, because an age at death is recorded only for children who have died. First we create a nested tibble of b6’s value labels.
# Create the data dictionary for b6 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b6,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b6 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b6) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b6)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as
  # "[100] died on day of birth"; complete() then adds a row for every
  # round x code combination that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b6", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b6 | 100 | [100] died on day of birth | [100] died on day of birth | [100] died on day of birth | [100] died on day of birth | |||||
Bangladesh | b6 | 101 | [101] days: 1 | [101] days: 1 | [101] days: 1 | [101] days: 1 | |||||
Bangladesh | b6 | 199 | [199] days, missing | [199] days: number missing | [199] days: number missing | [199] days: number missing | [199] days: number missing | ||||
Bangladesh | b6 | 201 | [201] months: 1 | [201] months: 1 | [201] months: 1 | [201] months: 1 | |||||
Bangladesh | b6 | 299 | [299] months, missing | [299] months: number missing | [299] months: number missing | [299] months: number missing | [299] months: number missing | ||||
Bangladesh | b6 | 301 | [301] years: 1 | [301] years: 1 | [301] years: 1 | [301] years: 1 | |||||
Bangladesh | b6 | 399 | [399] years, missing | [399] years: number missing | [399] years: number missing | [399] years: number missing | [399] years: number missing | ||||
Bangladesh | b6 | 997 | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent | [997] inconsistent |
Bangladesh | b6 | 998 | [998] don't know | [998] don't know | [998] don't know | [998] don't know | [998] don't know | [998] don't know | [998] don't know | [998] don't know | [998] don't know |
The above table shows that the value labels of age at death of child fall into two groups. First, they are the same for bdbr 1996, 2001 and 2006, and then for bdbr 2011, 2016 and 2022.
b9 - child lives with whom
We check the value labels of b9 variable which gives info on who the child lives with. First we create a nested tibble of b9’s value labels.
# Create the data dictionary for b9 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b9,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b9 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b9) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b9)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as
  # "[0] respondent"; complete() then adds a row for every
  # round x code combination that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b9", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b9 | 0 | [0] respondent | [0] respondent | [0] respondent | [0] respondent | [0] respondent | [0] respondent | [0] respondent | [0] respondent | [0] respondent |
Bangladesh | b9 | 1 | [1] father | [1] father | [1] father | [1] father | [1] father | [1] father | [1] father | [1] father | [1] father |
Bangladesh | b9 | 2 | [2] other relative | [2] other relative | [2] other relative | [2] other relative | [2] other relative | [2] other relative | [2] other relative | [2] other relative | [2] other relative |
Bangladesh | b9 | 3 | [3] someone else | [3] someone else | [3] someone else | [3] someone else | [3] someone else | [3] someone else | [3] someone else | [3] someone else | [3] someone else |
Bangladesh | b9 | 4 | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere | [4] lives elsewhere |
We can see in the above table that the value labels of b9 are same across all the bdbr datasets.
b10 - completeness of information
We check the value labels of b10 variable which gives the completeness of birth history information. First we create a nested tibble of b10’s value labels.
# Create the data dictionary for b10 in nested tibble.
# For every data frame stored in the bdbr_data list-column we keep only b10,
# build its labelled dictionary and retain just the value_labels column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_b10 = map(
    bdbr_data,
    \(df) {
      df |>
        select(b10) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_b10)) |>
  # Next we make the num of value labels same across each round.
  # parse_number() pulls the numeric code out of labels such as
  # "[1] month and year"; complete() then adds a row for every
  # round x code combination that is absent.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "b10", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | b10 | 0 | [0] month, year and day | [0] month, year and day | |||||||
Bangladesh | b10 | 1 | [1] month and year | [1] month and year | [1] month and year | [1] month and year | [1] month and year | [1] month and year - information complete | [1] month and year - information complete | [1] month and year - information complete | [1] month and year - information complete |
Bangladesh | b10 | 2 | [2] month and age -y imp | [2] month and age -y imp | [2] month and age -y imp | [2] month and age -y imp | [2] month and age -y imp | [2] month and age - year imputed | [2] month and age - year imputed | [2] month and age - year imputed | [2] month and age - year imputed |
Bangladesh | b10 | 3 | [3] year and age - m imp | [3] year and age - m imp | [3] year and age - m imp | [3] year and age - m imp | [3] year and age - m imp | [3] year and age - month imputed | [3] year and age - month imputed | [3] year and age - month imputed | [3] year and age - month imputed |
Bangladesh | b10 | 4 | [4] y & age - y ignored | [4] y & age - y ignored | [4] y & age - y ignored | [4] y & age - y ignored | [4] y & age - y ignored | [4] year and age - year ignored | [4] year and age - year ignored | [4] year and age - year ignored | [4] year and age - year ignored |
Bangladesh | b10 | 5 | [5] year - a, m imp | [5] year - a, m imp | [5] year - a, m imp | [5] year - a, m imp | [5] year - a, m imp | [5] year - age/month imputed | [5] year - age/month imputed | [5] year - age/month imputed | [5] year - age/month imputed |
Bangladesh | b10 | 6 | [6] age - y, m imp | [6] age - y, m imp | [6] age - y, m imp | [6] age - y, m imp | [6] age - y, m imp | [6] age - year/month imputed | [6] age - year/month imputed | [6] age - year/month imputed | [6] age - year/month imputed |
Bangladesh | b10 | 7 | [7] month - a, y imp | [7] month - a, y imp | [7] month - a, y imp | [7] month - a, y imp | [7] month - a, y imp | [7] month - age/year imputed | [7] month - age/year imputed | [7] month - age/year imputed | [7] month - age/year imputed |
Bangladesh | b10 | 8 | [8] none - all imp | [8] none - all imp | [8] none - all imp | [8] none - all imp | [8] none - all imp | [8] none - all imputed | [8] none - all imputed | [8] none - all imputed | [8] none - all imputed |
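We can see in the above table that the value labels of b10 use the same abbreviated wording across the bdbr 1993, 1996, 1999, 2004 and 2007 datasets. Then they use the same full wording for bdbr 2011, 2014, 2017 and 2022. The additional label [0] (month, year and day) is present in only two rounds.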
Checking the Common independent variables before harmonization
Next we start documenting the common independent variables. First we will check the data dictionary of the common independent variables. Then we will check them variable wise.
# We check the common independent vars in all bdbr datasets.
# First we create the data dictionary in nested tibble.
# For every data frame stored in the bdbr_data list-column we build a full
# dictionary of the selected variables.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_comindvars = map(
    bdbr_data,
    \(df) {
      df |>
        # select the common independent variables
        select(v106, v011, v501, v701, v025, v151, v152) |>
        lookfor(details = "full") |>
        select(-c(levels:n_na)) |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character()
    }
  ))
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_comindvars)) |>
  # Sort by pos so the rows for the same variable sit together across rounds
  arrange(pos)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 1996 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 1999 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2004 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2007 | 1 | v106 | highest educational level | dbl+lbl | 0 | 5 | 0 - 9 |
Bangladesh | 2011 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2014 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2017 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 2022 | 1 | v106 | highest educational level | dbl+lbl | 0 | 4 | 0 - 3 |
Bangladesh | 1993 | 2 | v011 | date of birth (cmc) | dbl | 0 | 421 | 529 - 949 |
Bangladesh | 1996 | 2 | v011 | date of birth (cmc) | dbl | 0 | 417 | 565 - 987 |
Bangladesh | 1999 | 2 | v011 | date of birth (cmc) | dbl | 0 | 423 | 600 - 1022 |
Bangladesh | 2004 | 2 | v011 | date of birth (cmc) | dbl | 0 | 424 | 650 - 1073 |
Bangladesh | 2007 | 2 | v011 | date of birth (cmc) | dbl | 0 | 423 | 688 - 1111 |
Bangladesh | 2011 | 2 | v011 | date of birth (cmc) | dbl | 0 | 424 | 740 - 1163 |
Bangladesh | 2014 | 2 | v011 | date of birth (cmc) | dbl | 0 | 422 | 776 - 1197 |
Bangladesh | 2017 | 2 | v011 | date of birth (cmc) | dbl | 0 | 422 | 815 - 1237 |
Bangladesh | 2022 | 2 | v011 | date of birth (cmc) | dbl | 0 | 423 | 871 - 1293 |
Bangladesh | 1993 | 3 | v501 | current marital status | dbl+lbl | 0 | 3 | 1 - 4 |
Bangladesh | 1996 | 3 | v501 | current marital status | dbl+lbl | 0 | 3 | 1 - 4 |
Bangladesh | 1999 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2004 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2007 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2011 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2014 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2017 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 2022 | 3 | v501 | current marital status | dbl+lbl | 0 | 4 | 1 - 5 |
Bangladesh | 1993 | 4 | v701 | partner's education level | dbl+lbl | 17 | 6 | 0 - 8 |
Bangladesh | 1996 | 4 | v701 | partner's education level | dbl+lbl | 35 | 6 | 0 - 8 |
Bangladesh | 1999 | 4 | v701 | partner's education level | dbl+lbl | 47 | 6 | 0 - 8 |
Bangladesh | 2004 | 4 | v701 | partner's education level | dbl+lbl | 35 | 5 | 0 - 3 |
Bangladesh | 2007 | 4 | v701 | partner's education level | dbl+lbl | 10 | 6 | 0 - 9 |
Bangladesh | 2011 | 4 | v701 | husband/partner's education level | dbl+lbl | 0 | 5 | 0 - 9 |
Bangladesh | 2014 | 4 | v701 | husband/partner's education level | dbl+lbl | 3 | 5 | 0 - 3 |
Bangladesh | 2017 | 4 | v701 | husband/partner's education level | dbl+lbl | 2935 | 6 | 0 - 8 |
Bangladesh | 2022 | 4 | v701 | husband/partner's education level | dbl+lbl | 23811 | 6 | 0 - 8 |
Bangladesh | 1993 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1996 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1999 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2004 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2007 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2011 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2014 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2017 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2022 | 5 | v025 | type of place of residence | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1993 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1996 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1999 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2004 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2007 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2011 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2014 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2017 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 2022 | 6 | v151 | sex of household head | dbl+lbl | 0 | 2 | 1 - 2 |
Bangladesh | 1993 | 7 | v152 | age of household head | dbl+lbl | 0 | 83 | 11 - 98 |
Bangladesh | 1996 | 7 | v152 | age of household head | dbl | 1 | 79 | 9 - 98 |
Bangladesh | 1999 | 7 | v152 | age of household head | dbl | 12 | 79 | 11 - 95 |
Bangladesh | 2004 | 7 | v152 | age of household head | dbl+lbl | 0 | 82 | 10 - 98 |
Bangladesh | 2007 | 7 | v152 | age of household head | dbl+lbl | 0 | 79 | 14 - 99 |
Bangladesh | 2011 | 7 | v152 | age of household head | dbl+lbl | 0 | 80 | 13 - 95 |
Bangladesh | 2014 | 7 | v152 | age of household head | dbl+lbl | 0 | 75 | 12 - 95 |
Bangladesh | 2017 | 7 | v152 | age of household head | dbl+lbl | 0 | 78 | 15 - 95 |
Bangladesh | 2022 | 7 | v152 | age of household head | dbl+lbl | 0 | 77 | 15 - 95 |
From the above table we get an overall snapshot of the common independent variables. We see that the majority of the variables have a different number of value labels across the bdbr datasets. Only v025 and v151 have the same number of value labels across bdbr rounds. Next, we look at the labelled class variables among these common variables in more detail. We would like to see if the value labels and codes of the common independent variables are similar across the bdbr datasets.
v106 - Mother’s education level
We check the value labels of v106 variable that denotes the highest education level of mother. First we create a nested tibble of v106’s value labels.
# Create the data dictionary for v106 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v106, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v106 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v106) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v106)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v106", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v106 | 0 | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education |
Bangladesh | v106 | 1 | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary |
Bangladesh | v106 | 2 | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary |
Bangladesh | v106 | 3 | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher |
We can see the value labels of v106 are similar across all the bdbr datasets. Although bdbr 2007 had 5 values as seen in Table 17, it has 4 value labels.
v011 - Date of birth (in CMC)
The v011 variable, which has the dob of mothers in cmc, is a numeric variable. Let’s check the range of these values in further details such as checking for outliers. First let’s create a nested tibble of the summary statistics of v011 variable.
# Create the summary statistics for v011 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we summarise
# v011 with skim_without_charts() and tidy the column names.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(skim_v011 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v011) |>
        skim_without_charts() |>
        as_tibble() |>
        select(-c(skim_type, complete_rate)) |>
        # Positional rename: depends on the column order that remains after
        # dropping skim_type and complete_rate above
        rename(
          variable = 1,
          n_miss = 2,
          mean = 3,
          sd = 4,
          min = 5,
          p25 = 6,
          p50 = 7,
          p75 = 8,
          max = 9
        )
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(skim_v011)) |>
  # Make variable values have one decimal point
  # (sprintf() turns mean and sd into character columns for display)
  mutate(
    mean = sprintf("%.1f", mean),
    sd = sprintf("%.1f", sd)
  )

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | variable | n_miss | mean | sd | min | p25 | p50 | p75 | max |
---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | v011 | 0 | 712.9 | 99.2 | 529 | 633 | 712 | 792 | 949 |
Bangladesh | 1996 | v011 | 0 | 750.0 | 100.4 | 565 | 668 | 748 | 829 | 987 |
Bangladesh | 1999 | v011 | 0 | 778.1 | 102.1 | 600 | 693 | 776 | 856 | 1022 |
Bangladesh | 2004 | v011 | 0 | 826.7 | 103.7 | 650 | 740 | 824 | 908 | 1073 |
Bangladesh | 2007 | v011 | 0 | 860.8 | 103.6 | 688 | 776 | 857 | 943 | 1111 |
Bangladesh | 2011 | v011 | 0 | 914.7 | 102.6 | 740 | 829 | 912 | 998 | 1163 |
Bangladesh | 2014 | v011 | 0 | 948.2 | 100.8 | 776 | 862 | 947 | 1027 | 1197 |
Bangladesh | 2017 | v011 | 0 | 983.7 | 100.6 | 815 | 897 | 981 | 1060 | 1237 |
Bangladesh | 2022 | v011 | 0 | 1041.6 | 96.5 | 871 | 963 | 1038 | 1115 | 1293 |
v501 - Mother’s marital status
We check the value labels of v501 variable which gives the current marital status of mother. First we create a nested tibble of v501’s value labels.
# Create the data dictionary for v501 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v501, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v501 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v501) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v501)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v501", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v501 | 0 | [0] never married | [0] never married | [0] never married | [0] never married | [0] never married | [0] never in union | [0] never in union | [0] never in union | [0] never in union |
Bangladesh | v501 | 1 | [1] married | [1] married | [1] married | [1] married | [1] married | [1] married | [1] married | [1] married | [1] married |
Bangladesh | v501 | 2 | [2] living together | [2] living together | [2] living together | [2] living together | [2] living together | [2] living with partner | [2] living with partner | [2] living with partner | [2] living with partner |
Bangladesh | v501 | 3 | [3] widowed | [3] widowed | [3] widowed | [3] widowed | [3] widowed | [3] widowed | [3] widowed | [3] widowed | [3] widowed |
Bangladesh | v501 | 4 | [4] divorced | [4] divorced | [4] divorced | [4] divorced | [4] divorced | [4] divorced | [4] divorced | [4] divorced | [4] divorced |
Bangladesh | v501 | 5 | [5] not living together | [5] not living together | [5] not living together | [5] not living together | [5] not living together | [5] no longer living together/separated | [5] no longer living together/separated | [5] no longer living together/separated | [5] no longer living together/separated |
All the bdbr rounds have the same number of value label codes. The bdbr 1993, 1996, 1999, 2004 and 2007 have the same set of value label texts. Then bdbr 2011, 2014, 2017 and 2022 have another set of similar value labels.
v701 - Husband/Partner’s education level
We check the value labels of v701 variable which gives the education level of the mother’s husband/partner. First we create a nested tibble of v701’s value labels.
# Create the data dictionary for v701 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v701, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v701 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v701) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v701)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v701", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v701 | 0 | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education | [0] no education |
Bangladesh | v701 | 1 | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary | [1] primary |
Bangladesh | v701 | 2 | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary | [2] secondary |
Bangladesh | v701 | 3 | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher | [3] higher |
Bangladesh | v701 | 8 | [8] don't know | [8] don't know | [8] don't know | [8] don't know | [8] don't know | [8] don't know | [8] don't know | [8] don't know | [8] don't know |
We see that all the bdbr rounds have 5 value labels and they are same across all rounds.
v025 - Type of place of residence
We check the value labels of v025 variable which shows if a household belongs to rural or urban psu. First we create a nested tibble of v025’s value labels.
# Create the data dictionary for v025 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v025, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v025 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v025) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(c(ctr_name, svy_year, lookfor_v025)) |>
  unnest(cols = c(lookfor_v025)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v025", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v025 | 1 | [1] urban | [1] urban | [1] urban | [1] urban | [1] urban | [1] urban | [1] urban | [1] urban | [1] urban |
Bangladesh | v025 | 2 | [2] rural | [2] rural | [2] rural | [2] rural | [2] rural | [2] rural | [2] rural | [2] rural | [2] rural |
The value labels and codes for v025 are the same across all the bdbr rounds.
v151 - Sex of household head
We check the value labels of v151 variable which gives the sex of the household head. First we create a nested tibble of v151’s value labels.
# Create the data dictionary for v151 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v151, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v151 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v151) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v151)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v151", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v151 | 1 | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male | [1] male |
Bangladesh | v151 | 2 | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female | [2] female |
The value labels and codes for v151 are the same across all the bdbr rounds.
v152 - Age of household head
Interestingly, we see v152 (a continuous variable) has value labels for all rounds except bdbr 1996 and 1999. Therefore, we check the value labels of v152 for those rounds. First we create a nested tibble of v152’s value labels.
# Create the data dictionary for v152 in nested tibble.
# For every survey round stored in the `bdbr_data` list-column we keep only
# v152, build its data dictionary with look_for(), expand it to one row per
# value label and retain just the `value_labels` column.
bdbr1_pre_tmp1 <- bdbr1_pre_tmp0 |>
  mutate(lookfor_v152 = map(
    bdbr_data,
    \(df) {
      df |>
        select(v152) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdbr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdbr1_pre_tmp2 <- bdbr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(-c(unf, bdbr_data, n_births)) |>
  unnest(cols = c(lookfor_v152)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label).
  # NOTE(review): the rendered table ends with a row that has no label_num;
  # presumably it comes from rounds where v152 carries no value labels.
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdbr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "v152", .before = 2)

# Convert the tibble to flextable for easy viewing
bdbr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdbr_1993 | bdbr_1996 | bdbr_1999 | bdbr_2004 | bdbr_2007 | bdbr_2011 | bdbr_2014 | bdbr_2017 | bdbr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | v152 | 96 | [96] 96+ | ||||||||
Bangladesh | v152 | 97 | [97] 97+ | [97] 97+ | [97] 97+ | [97] 97+ | [97] 97+ | [97] 97+ | |||
Bangladesh | v152 | 98 | [98] dk | [98] dk | [98] don't know | [98] don't know | [98] don't know | [98] don't know | |||
Bangladesh | v152 |
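We can see that the value labels of v152 are mostly for missing values. However, since v152 has very few missing values across the bdbr rounds (1 in bdbr 1996 and 12 in bdbr 1999, as shown in the summary table of common independent variables above), we need not be overly concerned about them.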
Bangladesh HH dataset use for variable creation
Checking the ID variables before harmonization
Here we check the formatting of the constituent variables with which we will prepare the ID variables for the pooled Bangladesh household recode (hr) dataset. We will use the following constituent variables for creating the ID variables for the pooled dataset:
# We check the var type of ID vars in all bdhr datasets.
# First we create a data dictionary of the bdhr datasets in nested tibble.
# For every survey round stored in the `bdhr_data` list-column we keep the
# constituent ID variables (hv001, hv002) and build a full data dictionary.
bdhr1_pre_tmp1 <- bdhr1_pre_tmp0 |>
  mutate(lookfor_idvars = map(
    bdhr_data,
    \(df) {
      df |>
        select(hv001, hv002) |>
        lookfor(details = "full") |>
        # Drop the dictionary columns from `levels` through `n_na`
        select(-c(levels:n_na)) |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character()
    }
  ))
# Print the nested tibble for inspection
bdhr1_pre_tmp1

# Now we unnest the tibble and output the pooled data dictionary
bdhr1_pre_tmp2 <- bdhr1_pre_tmp1 |>
  select(c(ctr_name, svy_year, lookfor_idvars)) |>
  unnest(cols = c(lookfor_idvars)) |>
  arrange(pos)

# Convert and view the tibble as flextable
bdhr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | 1 | hv001 | cluster number | dbl | 0 | 301 | 101 - 573 |
Bangladesh | 1996 | 1 | hv001 | cluster number | dbl | 0 | 313 | 101 - 630 |
Bangladesh | 1999 | 1 | hv001 | cluster number | dbl | 0 | 341 | 3 - 500 |
Bangladesh | 2004 | 1 | hv001 | cluster number | dbl | 0 | 361 | 1 - 550 |
Bangladesh | 2007 | 1 | hv001 | cluster number | dbl | 0 | 361 | 1 - 361 |
Bangladesh | 2011 | 1 | hv001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2014 | 1 | hv001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2017 | 1 | hv001 | cluster number | dbl | 0 | 672 | 1 - 675 |
Bangladesh | 2022 | 1 | hv001 | cluster number | dbl | 0 | 674 | 1 - 675 |
Bangladesh | 1993 | 2 | hv002 | household number | dbl | 0 | 544 | 1 - 615 |
Bangladesh | 1996 | 2 | hv002 | household number | dbl | 0 | 549 | 1 - 658 |
Bangladesh | 1999 | 2 | hv002 | household number | dbl | 0 | 500 | 1 - 547 |
Bangladesh | 2004 | 2 | hv002 | household number | dbl | 0 | 246 | 1 - 290 |
Bangladesh | 2007 | 2 | hv002 | household number | dbl | 0 | 198 | 1 - 252 |
Bangladesh | 2011 | 2 | hv002 | household number | dbl | 0 | 184 | 1 - 217 |
Bangladesh | 2014 | 2 | hv002 | household number | dbl | 0 | 204 | 1 - 222 |
Bangladesh | 2017 | 2 | hv002 | household number | dbl | 0 | 255 | 1 - 299 |
Bangladesh | 2022 | 2 | hv002 | household number | dbl | 0 | 206 | 1 - 225 |
From the above we can see that both the hv001 and hv002 are of numeric class with no missing values. These variables can be used for preparing the ID variables after finding the maximum length of their largest value. Note that survey year is also a constituent ID variable of 4-digits and we need not check it.
# We thought to process the above nested tibble further by decomposing the
# "range" col into min and max values using separate_wider_regex().
# However, we hit a roadblock as pattern did not identify the max values in
# some bdhr rounds correctly
# Instead we take the min/max from skim summary statistics, pool them over
# all rounds and derive the number of digits of the largest value.
bdhr1_pre_tmp3 <- bdhr1_pre_tmp0 |>
  # Generate the summary stats for id vars
  mutate(skim_idvars = map(bdhr_data, \(df) {
    df |>
      select(hv001, hv002) |>
      skim_without_charts()
  })) |>
  # Pool the summary stats for all bdhr rounds
  select(c(ctr_name, svy_year, skim_idvars)) |>
  unnest(cols = c(skim_idvars)) |>
  arrange(skim_variable, svy_year) |>
  # Group and generate the max and min values for each variable
  group_by(variable = skim_variable) |>
  summarize(
    min_val = min(numeric.p0),
    max_val = max(numeric.p100)
  ) |>
  # calculate the num of digits in the maximum values
  mutate(
    max_digits = nchar(as.character(max_val))
  ) |>
  # add variable labels and relocate it after variable name
  # (labels are attached by position, so their order must match the row
  # order of the summary: hv001 then hv002)
  bind_cols(vlabel = c("cluster number", "household number")) |>
  relocate(vlabel, .after = 1)

# Convert the tibble to flextable for easy viewing
bdhr1_pre_tmp3 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
variable | vlabel | min_val | max_val | max_digits |
---|---|---|---|---|
hv001 | cluster number | 1 | 675 | 3 |
hv002 | household number | 1 | 658 | 3 |
The above table gives the required length of the constituent ID variables to be set, so that we can correctly concatenate them to create the ID variables. The required length of the ID variables are given in max_digits column. Note that survey year is also a constituent ID variable of 4-digits.
Checking HH-level variables before harmonization
Here we check the wealth quintile variable before harmonizing them. Note in Bangladesh 1993, 1996 and 1999 the wealth quintile variables are provided in separate datasets. Therefore we join those variables to the hh file before proceeding with the checking.
Upon manually checking the full data dictionaries we find the variable names. Now we will check the data dictionary of these hh-level variables. Then we will check their value labels variable wise.
# We check the hh-level vars in all bdhr datasets.
# First we create the data dictionary in nested tibble.
# The wealth quintile variable is matched by exact name: either `wlthind5`
# or `hv270`, whichever is present in the round's dataset.
bdhr1_pre_tmp1 <- bdhr1_pre_tmp0 |>
  mutate(lookfor_hhvars = map(
    bdhr_data,
    \(df) {
      df |>
        # select the common independent variables
        select(
          matches("^wlthind5$|^hv270$")
        ) |>
        lookfor(details = "full") |>
        # Drop the dictionary columns from `levels` through `n_na`
        select(-c(levels:n_na)) |>
        # For correctly viewing the range column in data dictionary
        convert_list_columns_to_character()
    }
  ))
# Print the nested tibble for inspection
bdhr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdhr1_pre_tmp2 <- bdhr1_pre_tmp1 |>
  select(c(svy_year, lookfor_hhvars)) |>
  unnest(cols = c(lookfor_hhvars)) |>
  arrange(pos, svy_year)

# Convert the tibble to flextable for easy viewing
bdhr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|
1993 | 1 | wlthind5 | quintiles of wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
1996 | 1 | wlthind5 | quintiles of wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
1999 | 1 | wlthind5 | quintiles of wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
2004 | 1 | hv270 | wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
2007 | 1 | hv270 | wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
2011 | 1 | hv270 | wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
2014 | 1 | hv270 | wealth index | dbl+lbl | 0 | 5 | 1 - 5 |
2017 | 1 | hv270 | wealth index combined | dbl+lbl | 0 | 5 | 1 - 5 |
2022 | 1 | hv270 | wealth index combined | dbl+lbl | 0 | 5 | 1 - 5 |
The above table gives an overall snapshot of the hh-level variables. All the variables are of labelled class and have the same number of value labels across all the bdhr datasets. Next, we compare the value labels of the wealth quintile variable across the bdhr datasets.
Wealth index quintile variable
Next, we check the value labels of the household wealth quintile variable. The name of this variable differs across the bdhr datasets. First we create a nested tibble of the value labels.
# Create the data dictionary in nested tibble.
# For every survey round stored in the `bdhr_data` list-column we keep the
# wealth quintile variable (named `wlthind5` or `hv270`), build its data
# dictionary with look_for(), expand it to one row per value label and
# retain just the `value_labels` column.
bdhr1_pre_tmp1 <- bdhr1_pre_tmp0 |>
  mutate(lookfor_wiqt = map(
    bdhr_data,
    \(df) {
      df |>
        select(matches("^wlthind5$|^hv270$")) |>
        look_for() |>
        lookfor_to_long_format() |>
        select(value_labels)
    }
  ))
# Print the nested tibble for inspection
bdhr1_pre_tmp1

# Now we unnest the tibble and refine the pooled data dictionary
bdhr1_pre_tmp2 <- bdhr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(c(ctr_name, svy_year, lookfor_wiqt)) |>
  unnest(cols = c(lookfor_wiqt)) |>
  # Next we make the num of value labels same across each round
  # (label_num is the numeric code parsed from the "[code] text" label)
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdhr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "Wealth index quintiles", .before = 2)

# Convert the tibble to flextable for easy viewing
bdhr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdhr_1993 | bdhr_1996 | bdhr_1999 | bdhr_2004 | bdhr_2007 | bdhr_2011 | bdhr_2014 | bdhr_2017 | bdhr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | Wealth index quintiles | 0 | [0] | [0] | [0] | ||||||
Bangladesh | Wealth index quintiles | 1 | [1] lowest quintile | [1] lowest quintile | [1] lowest quintile | [1] poorest | [1] poorest | [1] poorest | [1] poorest | [1] poorest | [1] poorest |
Bangladesh | Wealth index quintiles | 2 | [2] second quintile | [2] second quintile | [2] second quintile | [2] poorer | [2] poorer | [2] poorer | [2] poorer | [2] poorer | [2] poorer |
Bangladesh | Wealth index quintiles | 3 | [3] middle quintile | [3] middle quintile | [3] middle quintile | [3] middle | [3] middle | [3] middle | [3] middle | [3] middle | [3] middle |
Bangladesh | Wealth index quintiles | 4 | [4] fourth quintile | [4] fourth quintile | [4] fourth quintile | [4] richer | [4] richer | [4] richer | [4] richer | [4] richer | [4] richer |
Bangladesh | Wealth index quintiles | 5 | [5] highest quintile | [5] highest quintile | [5] highest quintile | [5] richest | [5] richest | [5] richest | [5] richest | [5] richest | [5] richest |
The value labels are similar across the bdhr rounds with minor differences. Note, the value labels for bdhr 1993, 1996 and 1999 have an empty label code 0. The value label texts are the same within the set of bdhr 1993, 1996 and 1999, and the same within the set of bdhr 2004, 2007, 2011, 2014, 2017 and 2022 rounds. Therefore, we need to be mindful of this during harmonization.
Bangladesh PR dataset use for family structure variables creation
Checking the ID variables before harmonization
Here we check the formatting of the constituent variables with which we will prepare the ID variables for the pooled Bangladesh person recode (pr) dataset. We will use the following constituent variables for creating the ID variables for the pooled dataset:
# We check the var type of ID vars in all bdpr datasets.
# First we create a data dictionary of the bdpr datasets in nested tibble.
# For every survey round stored in the `bdpr_data` list-column we keep the
# constituent ID variables (hv001, hv002, hvidx) and build a full data
# dictionary.
bdpr1_pre_tmp1 <- bdpr1_pre_tmp0 |>
  mutate(lookfor_idvars = map(bdpr_data, \(df) {
    df |>
      select(hv001, hv002, hvidx) |>
      lookfor(details = "full") |>
      # Drop the dictionary columns from `levels` through `n_na`
      select(-c(levels:n_na)) |>
      # For correctly viewing the range column in data dictionary
      convert_list_columns_to_character()
  }))
# Print the nested tibble for inspection
bdpr1_pre_tmp1

# Now we unnest the tibble and output the pooled data dictionary
bdpr1_pre_tmp2 <- bdpr1_pre_tmp1 |>
  select(c(ctr_name, svy_year, lookfor_idvars)) |>
  unnest(cols = c(lookfor_idvars)) |>
  arrange(pos)

# Convert and view the tibble as flextable
bdpr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|---|
Bangladesh | 1993 | 1 | hv001 | cluster number | dbl | 0 | 301 | 101 - 573 |
Bangladesh | 1996 | 1 | hv001 | cluster number | dbl | 0 | 313 | 101 - 630 |
Bangladesh | 1999 | 1 | hv001 | cluster number | dbl | 0 | 341 | 3 - 500 |
Bangladesh | 2004 | 1 | hv001 | cluster number | dbl | 0 | 361 | 1 - 550 |
Bangladesh | 2007 | 1 | hv001 | cluster number | dbl | 0 | 361 | 1 - 361 |
Bangladesh | 2011 | 1 | hv001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2014 | 1 | hv001 | cluster number | dbl | 0 | 600 | 1 - 600 |
Bangladesh | 2017 | 1 | hv001 | cluster number | dbl | 0 | 672 | 1 - 675 |
Bangladesh | 2022 | 1 | hv001 | cluster number | dbl | 0 | 674 | 1 - 675 |
Bangladesh | 1993 | 2 | hv002 | household number | dbl | 0 | 544 | 1 - 615 |
Bangladesh | 1996 | 2 | hv002 | household number | dbl | 0 | 549 | 1 - 658 |
Bangladesh | 1999 | 2 | hv002 | household number | dbl | 0 | 500 | 1 - 547 |
Bangladesh | 2004 | 2 | hv002 | household number | dbl | 0 | 246 | 1 - 290 |
Bangladesh | 2007 | 2 | hv002 | household number | dbl | 0 | 198 | 1 - 252 |
Bangladesh | 2011 | 2 | hv002 | household number | dbl | 0 | 184 | 1 - 217 |
Bangladesh | 2014 | 2 | hv002 | household number | dbl | 0 | 204 | 1 - 222 |
Bangladesh | 2017 | 2 | hv002 | household number | dbl | 0 | 255 | 1 - 299 |
Bangladesh | 2022 | 2 | hv002 | household number | dbl | 0 | 206 | 1 - 225 |
Bangladesh | 1993 | 3 | hvidx | line number | dbl | 0 | 28 | 1 - 28 |
Bangladesh | 1996 | 3 | hvidx | line number | dbl | 0 | 29 | 1 - 29 |
Bangladesh | 1999 | 3 | hvidx | line number | dbl | 0 | 26 | 1 - 26 |
Bangladesh | 2004 | 3 | hvidx | line number | dbl | 0 | 33 | 1 - 33 |
Bangladesh | 2007 | 3 | hvidx | line number | dbl | 0 | 40 | 1 - 40 |
Bangladesh | 2011 | 3 | hvidx | line number | dbl | 0 | 31 | 1 - 31 |
Bangladesh | 2014 | 3 | hvidx | line number | dbl | 0 | 25 | 1 - 25 |
Bangladesh | 2017 | 3 | hvidx | line number | dbl | 0 | 30 | 1 - 30 |
Bangladesh | 2022 | 3 | hvidx | line number | dbl | 0 | 25 | 1 - 25 |
From the above table we can see that all the three constituent ID variables are of numeric class with no missing values. These variables can directly be used for preparing the ID variables after finding the maximum length of their largest value. Note that survey year is also a constituent ID variable of 4-digits and we need not check it.
# Summarise the three constituent ID variables (hv001, hv002, hvidx) over all
# bdpr rounds: pooled minimum, pooled maximum and the number of digits of the
# maximum, which is the width needed when concatenating them into an ID.
#
# We initially planned to derive min and max by splitting the "range" column
# of the data dictionary above with separate_wider_regex(). However, the
# pattern did not identify the max values correctly in some bdpr rounds, so
# we compute the summary statistics directly with skimr instead.
bdpr1_pre_tmp3 <- bdpr1_pre_tmp0 |>
  # Generate the summary stats for id vars in each round
  mutate(skim_idvars = map(bdpr_data, \(df) {
    df |>
      select(hv001, hv002, hvidx) |>
      skim_without_charts()
  })) |>
  # Pool the summary stats for all bdpr rounds
  select(c(ctr_name, svy_year, skim_idvars)) |>
  unnest(cols = c(skim_idvars)) |>
  arrange(skim_variable, svy_year) |>
  # Group and generate the max and min values for each variable
  group_by(variable = skim_variable) |>
  summarize(
    min_val = min(numeric.p0),
    max_val = max(numeric.p100)
  ) |>
  # Calculate the num of digits in the maximum values
  mutate(
    max_digits = nchar(as.character(max_val))
  ) |>
  # Add variable labels right after the variable name. The labels are looked
  # up by variable name (not by row position) so that they cannot be attached
  # to the wrong row if the row order ever changes.
  mutate(
    vlabel = unname(
      c(
        hv001 = "cluster number",
        hv002 = "household number",
        hvidx = "Persons line number"
      )[variable]
    ),
    .after = 1
  )
# Convert the tibble to flextable for easy viewing:
# left-align header and body, then fit the column widths to the content.
bdpr1_pre_tmp3 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
variable | vlabel | min_val | max_val | max_digits |
---|---|---|---|---|
hv001 | cluster number | 1 | 675 | 3 |
hv002 | household number | 1 | 658 | 3 |
hvidx | Persons line number | 1 | 40 | 2 |
The above table gives the required lengths of the constituent ID variables, so that we can correctly concatenate them to create the ID variables. The required length of each constituent ID variable is given in the max_digits column. Note that survey year is also a constituent ID variable of 4-digits.
Checking Family structure variables before harmonization
Here we check the family structure related variables before harmonizing them. The variable names were collected by manually checking the full data dictionaries. Here we will check the data dictionary of these hh-level variables and focus on the variable types.
# We check the family structure vars in all bdpr datasets.
# First we create the data dictionary in a nested tibble: one dictionary
# (list-column `lookfor_famstrvars`) per bdpr round.
bdpr1_pre_tmp1 <- bdpr1_pre_tmp0 |>
  mutate(lookfor_famstrvars = map(bdpr_data, \(df) {
    df |>
      # Select the family structure variables
      select(c(hv101, hv102, hv103, hv104, hv105)) |>
      look_for(details = "full") |>
      # Drop the dictionary columns from `levels` through `n_na`
      select(-c(levels:n_na)) |>
      # For correctly viewing the range column in data dictionary
      convert_list_columns_to_character()
  }))
bdpr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary:
# one row per variable and survey round, ordered by variable position and
# then by survey year.
bdpr1_pre_tmp2 <- bdpr1_pre_tmp1 |>
  select(c(svy_year, lookfor_famstrvars)) |>
  unnest(cols = c(lookfor_famstrvars)) |>
  arrange(pos, svy_year)
# Convert the tibble to flextable for easy viewing:
# left-align header and body, then fit the column widths to the content.
bdpr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
svy_year | pos | variable | label | col_type | missing | unique_values | range |
---|---|---|---|---|---|---|---|
1993 | 1 | hv101 | relationship to head | dbl+lbl | 10 | 12 | 1 - 12 |
1996 | 1 | hv101 | relationship to head | dbl+lbl | 5 | 12 | 1 - 12 |
1999 | 1 | hv101 | relationship to head | dbl+lbl | 6 | 13 | 1 - 98 |
2004 | 1 | hv101 | relationship to head | dbl+lbl | 5 | 12 | 1 - 12 |
2007 | 1 | hv101 | relationship to head | dbl+lbl | 0 | 12 | 1 - 99 |
2011 | 1 | hv101 | relationship to head | dbl+lbl | 0 | 11 | 1 - 12 |
2014 | 1 | hv101 | relationship to head | dbl+lbl | 0 | 11 | 1 - 12 |
2017 | 1 | hv101 | relationship to head | dbl+lbl | 0 | 11 | 1 - 12 |
2022 | 1 | hv101 | relationship to head | dbl+lbl | 0 | 12 | 1 - 98 |
1993 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
1996 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
1999 | 2 | hv102 | usual resident | dbl+lbl | 23 | 3 | 0 - 1 |
2004 | 2 | hv102 | usual resident | dbl+lbl | 3 | 3 | 0 - 1 |
2007 | 2 | hv102 | usual resident | dbl+lbl | 0 | 3 | 0 - 9 |
2011 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
2014 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
2017 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
2022 | 2 | hv102 | usual resident | dbl+lbl | 0 | 2 | 0 - 1 |
1993 | 3 | hv103 | slept last night | dbl+lbl | 0 | 2 | 0 - 1 |
1996 | 3 | hv103 | slept last night | dbl+lbl | 7 | 3 | 0 - 1 |
1999 | 3 | hv103 | slept last night | dbl+lbl | 34 | 3 | 0 - 1 |
2004 | 3 | hv103 | slept last night | dbl+lbl | 3 | 3 | 0 - 1 |
2007 | 3 | hv103 | slept last night | dbl+lbl | 0 | 3 | 0 - 9 |
2011 | 3 | hv103 | slept last night | dbl+lbl | 0 | 2 | 0 - 1 |
2014 | 3 | hv103 | slept last night | dbl+lbl | 0 | 2 | 0 - 1 |
2017 | 3 | hv103 | slept last night | dbl+lbl | 0 | 2 | 0 - 1 |
2022 | 3 | hv103 | stayed last night | dbl+lbl | 0 | 2 | 0 - 1 |
1993 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
1996 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
1999 | 4 | hv104 | sex of household member | dbl+lbl | 4 | 3 | 1 - 2 |
2004 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
2007 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
2011 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
2014 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
2017 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
2022 | 4 | hv104 | sex of household member | dbl+lbl | 0 | 2 | 1 - 2 |
1993 | 5 | hv105 | age of household members | dbl+lbl | 3 | 98 | 0 - 98 |
1996 | 5 | hv105 | age of household members | dbl+lbl | 16 | 97 | 0 - 98 |
1999 | 5 | hv105 | age of household members | dbl+lbl | 29 | 96 | 0 - 98 |
2004 | 5 | hv105 | age of household members | dbl+lbl | 8 | 99 | 0 - 98 |
2007 | 5 | hv105 | age of household members | dbl+lbl | 0 | 98 | 0 - 99 |
2011 | 5 | hv105 | age of household members | dbl+lbl | 4 | 98 | 0 - 96 |
2014 | 5 | hv105 | age of household members | dbl+lbl | 6 | 98 | 0 - 98 |
2017 | 5 | hv105 | age of household members | dbl+lbl | 0 | 94 | 0 - 95 |
2022 | 5 | hv105 | age of household members | dbl+lbl | 0 | 96 | 0 - 95 |
The above table gives an overall snapshot of the family structure related variables. Interestingly, all the variables, including age of hh members (a continuous var), are of labelled class. Most of the variables have a few missing values in some of the earlier bdpr rounds (1993 to 2004), whereas from bdpr 2007 onwards only hv105 has missing values (in 2011 and 2014). Note that, for the three variables of interest hv101-hv103, the number of unique values differs across the bdpr rounds, and in bdpr 2007 the range extends to 99 for hv101 and to 9 for hv102 and hv103. Next, we compare the value labels of the individual variables across the bdpr datasets.
hv101 - Relationship to head
Next, we check the value labels of the relationship to the household head variable. First we create a nested tibble of the value labels.
# Create the data dictionary in nested tibble:
# for each bdpr round, keep only the value labels of hv101 in long format
# (one row per value label).
bdpr1_pre_tmp1 <- bdpr1_pre_tmp0 |>
  mutate(lookfor_hv101 = map(bdpr_data, \(df) {
    df |>
      select(hv101) |>
      look_for() |>
      lookfor_to_long_format() |>
      select(value_labels)
  }))
bdpr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary into a wide
# table: one row per value code, one column of value labels per bdpr round.
bdpr1_pre_tmp2 <- bdpr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(c(ctr_name, svy_year, lookfor_hv101)) |>
  unnest(cols = c(lookfor_hv101)) |>
  # Next we make the num of value labels same across each round:
  # extract the numeric code from the label text and fill in the
  # code/round combinations that are absent
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdpr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "hv101", .before = 2)
# Convert the tibble to flextable for easy viewing:
# left-align header and body, then fit the column widths to the content.
bdpr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdpr_1993 | bdpr_1996 | bdpr_1999 | bdpr_2004 | bdpr_2007 | bdpr_2011 | bdpr_2014 | bdpr_2017 | bdpr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | hv101 | 1 | [1] head | [1] head | [1] head | [1] head | [1] head | [1] head | [1] head | [1] head | [1] head |
Bangladesh | hv101 | 2 | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband | [2] wife or husband |
Bangladesh | hv101 | 3 | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter | [3] son/daughter |
Bangladesh | hv101 | 4 | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law | [4] son/daughter-in-law |
Bangladesh | hv101 | 5 | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild | [5] grandchild |
Bangladesh | hv101 | 6 | [6] parent | [6] parent | [6] parent | [6] parent | [6] parent | [6] parent | [6] parent | [6] parent | [6] parent |
Bangladesh | hv101 | 7 | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law | [7] parent-in-law |
Bangladesh | hv101 | 8 | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister | [8] brother/sister |
Bangladesh | hv101 | 9 | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse | [9] co-spouse |
Bangladesh | hv101 | 10 | [10] other relative | [10] other relative | [10] other relative | [10] other relative | [10] other relative | [10] other relative | [10] other relative | [10] other relative | [10] other relative |
Bangladesh | hv101 | 11 | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child | [11] adopted/foster child |
Bangladesh | hv101 | 12 | [12] not related | [12] not related | [12] not related | [12] not related | [12] not related | [12] not related | [12] not related | [12] not related | [12] not related |
Bangladesh | hv101 | 13 | [13] niece/nephew by blood | [13] niece/nephew by blood | [13] niece/nephew by blood | [13] niece/nephew by blood | [13] niece/nephew by blood | ||||
Bangladesh | hv101 | 14 | [14] niece/nephew by marriage | [14] niece/nephew by marriage | [14] niece/nephew by marriage | [14] niece/nephew by marriage | [14] niece/nephew by marriage | ||||
Bangladesh | hv101 | 98 | [98] dk | [98] dk | [98] dk | [98] dk | [98] dk | [98] don't know | [98] don't know | [98] don't know | [98] don't know |
The above table shows that the value label texts vary across the bdpr rounds. To harmonize the relationship to head variable we will use the following value labels -
- 1 head
- 2 spouse
- 3 child
- 4 child-in-law
- 5 grandchild
- 6 parent
- 7 parent-in-law
- 8 sibling
- 9 others
Here, we merge the “wife or husband” and “co-spouse” categories into the “spouse” category, and the “son/daughter” and “adopted/foster child” categories into the “child” category.
hv102 - de jure/usual resident
Next, we check the value labels of the de jure resident variable. This variable indicates whether a household member is a usual resident of the household. First we create a nested tibble of the value labels.
# Create the data dictionary in nested tibble:
# for each bdpr round, keep only the value labels of hv102 in long format
# (one row per value label).
bdpr1_pre_tmp1 <- bdpr1_pre_tmp0 |>
  mutate(lookfor_hv102 = map(bdpr_data, \(df) {
    df |>
      select(hv102) |>
      look_for() |>
      lookfor_to_long_format() |>
      select(value_labels)
  }))
bdpr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary into a wide
# table: one row per value code, one column of value labels per bdpr round.
bdpr1_pre_tmp2 <- bdpr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(c(ctr_name, svy_year, lookfor_hv102)) |>
  unnest(cols = c(lookfor_hv102)) |>
  # Next we make the num of value labels same across each round:
  # extract the numeric code from the label text and fill in the
  # code/round combinations that are absent
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdpr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "hv102", .before = 2)
# Convert the tibble to flextable for easy viewing:
# left-align header and body, then fit the column widths to the content.
bdpr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdpr_1993 | bdpr_1996 | bdpr_1999 | bdpr_2004 | bdpr_2007 | bdpr_2011 | bdpr_2014 | bdpr_2017 | bdpr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | hv102 | 0 | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no |
Bangladesh | hv102 | 1 | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes |
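The above table shows that hv102 has the same value label texts and codes across the bdpr rounds. Therefore, we can use this variable directly after converting to factor type. Note, however, that the data dictionary above reports a range of 0 - 9 for hv102 in bdpr 2007, so the unlabelled code 9 in that round should be checked (and most likely set to missing) before the conversion.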
hv103 - de facto resident
Next, we check the value labels of the de facto resident variable. In DHS this means if a household member slept last night in the household. First we create a nested tibble of the value labels.
# Create the data dictionary in nested tibble:
# for each bdpr round, keep only the value labels of hv103 in long format
# (one row per value label).
bdpr1_pre_tmp1 <- bdpr1_pre_tmp0 |>
  mutate(lookfor_hv103 = map(bdpr_data, \(df) {
    df |>
      select(hv103) |>
      look_for() |>
      lookfor_to_long_format() |>
      select(value_labels)
  }))
bdpr1_pre_tmp1
# Now we unnest the tibble and refine the pooled data dictionary into a wide
# table: one row per value code, one column of value labels per bdpr round.
bdpr1_pre_tmp2 <- bdpr1_pre_tmp1 |>
  # First we select the required cols and unnest()
  select(c(ctr_name, svy_year, lookfor_hv103)) |>
  unnest(cols = c(lookfor_hv103)) |>
  # Next we make the num of value labels same across each round:
  # extract the numeric code from the label text and fill in the
  # code/round combinations that are absent
  mutate(label_num = parse_number(value_labels)) |>
  complete(ctr_name, svy_year, label_num) |>
  # Next we create col of value labels for each survey round
  pivot_wider(
    names_from = svy_year,
    values_from = value_labels,
    names_prefix = "bdpr_"
  ) |>
  # Show the variable name in a col
  mutate(var_name = "hv103", .before = 2)
# Convert the tibble to flextable for easy viewing:
# left-align header and body, then fit the column widths to the content.
bdpr1_pre_tmp2 |>
  qflextable() |>
  align(align = "left", part = "all") |>
  autofit()
ctr_name | var_name | label_num | bdpr_1993 | bdpr_1996 | bdpr_1999 | bdpr_2004 | bdpr_2007 | bdpr_2011 | bdpr_2014 | bdpr_2017 | bdpr_2022 |
---|---|---|---|---|---|---|---|---|---|---|---|
Bangladesh | hv103 | 0 | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no | [0] no |
Bangladesh | hv103 | 1 | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes | [1] yes |
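The above table shows that hv103 has the same value label texts and codes across the bdpr rounds. Therefore, we can use this variable directly after converting to factor type. Note, however, that the data dictionary above reports a range of 0 - 9 for hv103 in bdpr 2007, so the unlabelled code 9 in that round should be checked (and most likely set to missing) before the conversion.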
START FROM HERE
TASK:
- Handling multiple births in death scarring vars may not be necessary.
- Preceding birth interval construction has changed with DHS-7. We could re-construct it.
TO BE CONTINUED …