%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
% Part of the replication package for the paper
%   "Marginal Effects for Probit and Tobit with Endogeneity"
%   by Kirill S. Evdokimov, Ilze Kalnina, and Andrei Zeleneev.
%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

%% Configuration: input/output locations and flags
clear

% Output: the cleaned sample written at the end of this script.
fname_out_csv   = "NLSY/nlsy97_empirical.csv";

% Inputs: both files share one prefix (alternative prefix kept from the
% original comment: "v001\nlsy97_v001").
s_input_fname_prefix = "NLSY/nlsy97_raw/nlsy97_v0";
fname_in_raw    = strcat(s_input_fname_prefix, ".csv");              % raw
fname_in_labels = strcat(s_input_fname_prefix, "-value-labels.do");  % source of the variable renames

% Flags; not referenced in the rest of this script as visible here.
flagCleanSimilarToR1 = 1;
s_extra_flags = "";

%% parse RAW data
% Load the labels script as text and normalise two name fragments
% ('_XRND' -> '_XR' to match Stata names, then 'KEY!' -> 'KEY_').
s0 = fileread(fname_in_labels);
s0 = strrep(strrep(s0, '_XRND', '_XR'), 'KEY!', 'KEY_');

% file nlsy97_v0-value-labels.do contains lines like this:
%   rename E0013928 EMP_STATUS_2019_28_XRND   // EMP_STATUS_2019.28
% which we use to extract variable names: each match yields the two tokens
% that follow the word "rename".
rx_rename = "\n[ ]*rename[ ]+(\w{6,20})[ ]+(\w+)[ ]+";
arr_var_renames = regexp(s0, rx_rename, "tokens");
% cell_flatten is a project helper (not defined in this file); the result is
% indexed below as (:,1) = name in the CSV and (:,2) = new name.
arr_var_renames = cell_flatten(arr_var_renames');
arr_var_renames = string(arr_var_renames);
%size(arr_var_renames)
%size(arr_var_renames)

% Load the raw extract and switch to the names parsed from the labels file
% (the elapsed time of both steps is printed by toc).
tic
T_raw_0 = readtable(fname_in_raw);
names_in_csv = arr_var_renames(:,1);
names_wanted = arr_var_renames(:,2);
T_raw_1 = renamevars(T_raw_0, names_in_csv, names_wanted);
toc

% Column order: PUBID_1997 first, then the variables matching KEY_\w+.
T_raw_1 = movevars(T_raw_1, "PUBID_1997", 'Before', 1);
key_vars = regexpPattern("KEY_\w+");
T_raw_1 = movevars(T_raw_1, key_vars, 'After', 1);

%% Sample restriction and basic respondent variables
% Keep respondents with KEY_SEX_1997 == 2.
is_sex_2 = T_raw_1.KEY_SEX_1997 == 2;
T_raw = T_raw_1(is_sex_2, :);

% Age is measured relative to 2018.
T_raw.age = 2018 - T_raw.KEY_BDATE_Y_1997;

% Education: 95 = "Ungraded" in the codebook, be careful about these values.
% It is recoded to 0 here (observations with educ <= 0 are dropped later).
educ = T_raw.CVC_HGC_EVER_XR;
educ(educ == 95) = 0;
T_raw.educ = educ;

% Income variables copied under descriptive names.
T_raw.spouse_inc        = T_raw.YINC_2600_2019;
T_raw.wife_inc          = T_raw.YINC_1700_2019;
T_raw.wife_had_empl_inc = T_raw.YINC_1400_2019;
T_raw.family_inc        = T_raw.CV_INCOME_FAMILY_2019;

% Helper: returns, as one array, the columns <s_prefix>_01_XR ... <s_prefix>_<nn>_XR of table T.
fn_XRND_arr = @(T, s_prefix, nn) T{:, arrayfun(@(k) sprintf(s_prefix + "_%02d_XR", k), 1:nn)};

%% Experience

%  CVC_WKSWK_ADULT2_ET  Experience as an adult (in weeks), starting 20 years of age
% Non-negative values are converted from weeks to years; negative values are
% left untouched so that they can be filtered out later (exper < 0).
exper = T_raw.CVC_WKSWK_ADULT2_ET_XR;
ix = exper >= 0;
exper(ix) = exper(ix)/52; %exper in years
T_raw.exper = exper;

%% Keep married respondents with exactly one spouse in the household roster
% MAR_STATUS:
%   0 Never Married, Not Cohabitating
%   1 Never Married, Cohabiting
%   2 Married
%   3 Legally Separated
%   4 Divorced
%   5 Widowed
MAR_STATUS = fn_XRND_arr(T_raw, "MAR_STATUS_2018", 12);

% "Married" (code 2) is required in every one of the 12 MAR_STATUS_2018_xx columns.
ix_married = all(MAR_STATUS == 2, 2);

% Spouse Information: parsing HHI_RELY roster
% A roster slot counts as a spouse when its HHI_RELY code is 1 or 2.
HHI_RELY = T_raw{:, arrayfun(@(i)  sprintf("HHI_RELY_%02d_2019", i), 1:17)};
is_spouse_slot = ismember(HHI_RELY, [1 2]);
one_spouse = sum(is_spouse_slot, 2) == 1; %only one spouse recorded

ix_keep = ix_married & one_spouse;
T_raw = T_raw(ix_keep,:);
fprintf('# Married women (one spouse): %d observations\n', nrows(T_raw));

% Re-extract the HHI_RELY roster from the current T_raw so that its rows
% correspond to the rows of T_raw used in the loop below.
hhi_rely_cols = arrayfun(@(k) sprintf("HHI_RELY_%02d_2019", k), 1:17);
HHI_RELY = T_raw{:, hhi_rely_cols};

% Cohabitation / partner-link arrays and partner identifiers.
% None of these is referenced again in the visible part of this script —
% presumably they are kept for interactive inspection.
MAR_COHABITATION = fn_XRND_arr(T_raw, "MAR_COHABITATION_2018", 12);
MAR_PARTNER_LINK = fn_XRND_arr(T_raw, "MAR_PARTNER_LINK_2018", 12);
PARTNERS_CURRENT_01_2019 = T_raw.PARTNERS_CURRENT_01_2019;

partner_cols = [ ...
  "CV_MARSTAT_2019", ...
  "MAR_COHABITATION_2018_01_XR", "MAR_PARTNER_LINK_2018_01_XR", ...
  "MAR_COHABITATION_2019_01_XR", "MAR_PARTNER_LINK_2019_01_XR", ...
  "PARTNERS_UID_01_2019", "PARTNERS_UID_02_2019", "PARTNERS_UID_03_2019", ...
  "PARTNERS_ID_01_2019", "PARTNERS_ID_02_2019", "PARTNERS_ID_03_2019"];
PARTNERS_ID_UID = T_raw(:, partner_cols);

% Spouse attributes, read from the household-roster slot whose HHI_RELY code is 1 or 2.
% Everything starts as NaN and stays NaN when no such slot exists.
%   spouse_educ:          extracted from HHI_* rosters: HHI_HIGHGRADE_xx_2019
%                         (Very few values: YMAR_3700_01  HIGHEST DEGREE/CERTIFICATE PARTNER 01 RECEIVED)
%   spouse_inc_2019_hhi:  Spouse income in 2019 via HHI roster
%                         (there is another spouse income variable in the dataset)
nn = nrows(T_raw);
[spouse_age, spouse_sex, spouse_educ, spouse_empl, spouse_inc_2019_hhi, spouse_rel] = deal(NaN(nn,1));

% Accessor: value in row `row` of the roster column obtained by formatting `fmt` with `slot`.
hhi_value = @(row, fmt, slot) T_raw{row, sprintf(fmt, slot)};

for i=1:nn
  i_spouse = find(HHI_RELY(i,:)==2 | HHI_RELY(i,:)==1);
  if isempty(i_spouse)
    continue
  end
  assert(isscalar(i_spouse));

  spouse_age(i)  = hhi_value(i, "HHI_AGE_%02d_2019"      , i_spouse);
  spouse_sex(i)  = hhi_value(i, "HHI_SEX_%02d_2019"      , i_spouse);
  spouse_empl(i) = hhi_value(i, "HHI_EMPLOYED_%02d_2019"   , i_spouse);

  % Positive roster income is multiplied by 12; other values (e.g. negative
  % codes) pass through unchanged. qyn is a project helper, presumably a
  % ternary "cond ? a : b" — confirm.
  sp_monthly_inc = hhi_value(i, "HHI_INCOME_%02d_2019"   , i_spouse);
  spouse_inc_2019_hhi(i) = qyn(sp_monthly_inc>0, 12*sp_monthly_inc, sp_monthly_inc);

  spouse_educ(i) = hhi_value(i, "HHI_HIGHGRADE_%02d_2019", i_spouse);
  spouse_rel(i)  = hhi_value(i, "HHI_RELY_%02d_2019"     , i_spouse);
end

% Append the new columns (named after the workspace variables) to T_raw.
T_raw = addvars(T_raw, spouse_empl, spouse_inc_2019_hhi, spouse_educ, spouse_age, spouse_sex, spouse_rel);

%% Hours worked

%  https://www.nlsinfo.org/content/cohorts/nlsy97/topical-guide/employment/work-experience
T_raw.hours = T_raw.CVC_HOURS_WK_YR_ET_18_XR;  % Total Hours Worked in 2018
% Negative values are left unchanged here; observations with hours < 0 are
% dropped in the cleaning step further down. (The unused mask
% "ix = T_raw.hours>=0" that used to follow was removed: it was overwritten
% before ever being read.)

%% Number of Children: LT6 and GE6

hh_under_6  = T_raw.CV_HH_UNDER_6_2019;
hh_under_18 = T_raw.CV_HH_UNDER_18_2019;
T_raw.kidslt6 = hh_under_6;
T_raw.kidsge6 = hh_under_18 - hh_under_6;

% Three observations have "Invalid Skip" flag for the above (one will be dropped later for another reason)
% So we recalculate these numbers using the CV_CHILD_BIRTH_MONTH_ array

% Children: Month of birth, with 1980-Jan being Month 1
ref_month = calmonths(between(datetime("1980-Jan-01"), datetime("2018-Dec-31"))); % compute kids age relative to this month
% NOTE(review): ref_month appears to count whole months elapsed since
% 1980-Jan-01, whereas birth months are 1-based (1980-Jan = 1). A child born
% in 2018-Dec would then get age_mo = -1 and be excluded — confirm that this
% one-month offset is intended.
age_mo = T_raw{:, "CV_CHILD_BIRTH_MONTH_" + [("0" + (1:9))  ("" + (10:13))] + "_2019"};
age_mo(age_mo<0) = NaN;          % negative values are treated as missing
age_mo = ref_month - age_mo;     % age in months relative to ref_month
kidslt6_v1 = sum(age_mo>=0 & age_mo<6*12, 2, "omitnan");
kidsge6_v1 = sum(age_mo>=6*12 & age_mo<18*12, 2, "omitnan");

% Detect invalid values on the RAW components, not only on the difference:
% kidsge6 is a difference of two counts, so a negative code in
% CV_HH_UNDER_6_2019 can cancel or flip sign (e.g. -3 - (-3) = 0) and would
% not be caught by testing kidsge6 < 0 alone.
ix_neg_kidslt6 = hh_under_6 < 0;
ix_neg_kidsge6 = hh_under_6 < 0 | hh_under_18 < 0 | T_raw.kidsge6 < 0;
fprintf('Fixing "kidslt6" for %d observations, and "kidsge6" for %d observations\n', sum(ix_neg_kidslt6), sum(ix_neg_kidsge6));
T_raw.kidslt6(ix_neg_kidslt6) = kidslt6_v1(ix_neg_kidslt6);
T_raw.kidsge6(ix_neg_kidsge6) = kidsge6_v1(ix_neg_kidsge6);

%% Making data for the analysis: T_out
% A variable is kept when its name starts with any of these prefixes
% (the list is reproduced as-is, including overlapping entries).
keep_prefixes = ["PUBID_1997", "CVC_RND_XR", "KEY_SEX", "KEY_BDATE", "KEY_RACE", ...
                 "hours", "exper", "age", "family_", "kids", "spouse", "wife", ...
                 "educ", "wife_inc", "YINC_"];
var_names = T_raw.Properties.VariableNames;
ix = startsWith(var_names, keep_prefixes);

% Cleaning
% Place the roster-based spouse income right next to spouse_inc.
T = T_raw(:, ix);
T = movevars(T, "spouse_inc_2019_hhi", "After", "spouse_inc");

% disp(sum(T.spouse_educ==0:20)); % summary of spouse educ
% Step 1: spouse education must lie in [0, 20].
ix_drop = T.spouse_educ < 0 | T.spouse_educ > 20;
fprintf("# Dropping %4d observations: missing 'spouse_educ'\n", sum(ix_drop));
T = T(~ix_drop,:);

% Step 2: the respondent's own experience, education and hours must be valid.
ix_drop = T.exper < 0 | T.educ <= 0 | T.hours < 0;
fprintf("# Dropping %4d observations: exper < 0 | educ <= 0 | hours < 0\n", sum(ix_drop));
T = T(~ix_drop,:);

% Step 3: the indicator "hours > 0" must equal wife_had_empl_inc.
% Note that any value of wife_had_empl_inc other than 0/1 never equals the
% logical indicator, so such rows are dropped as well.
worked_in_2018 = T.hours > 0;
ix_inconsistent_hours = worked_in_2018 ~= T.wife_had_empl_inc;
ix_drop = ix_inconsistent_hours;
fprintf("# of cases: inconsistent 'hours':   %4d\n", sum(ix_drop));
fprintf("   => Dropping these observations\n");
T = T(~ix_drop,:);

% Step 4: where spouse_inc is negative but the roster-based value is
% non-negative, use the roster-based value and flag the observation.
ix = T.spouse_inc < 0 & T.spouse_inc_2019_hhi >= 0;
fprintf("# of cases: replaced 'spouse_inc' with 'spouse_inc_2019_hhi' computed via HHI roster:   %4d\n", sum(ix));
fprintf('  (Flagged spouse_inc_imputed=1 for these observations)\n');
T.spouse_inc(ix) = T.spouse_inc_2019_hhi(ix);  % nlsy97_vC.csv used this
T.spouse_inc_imputed = ix;

% Step 5: wife_inc code -4 is recoded to zero income
% (presumably the survey's "valid skip" code — confirm against the codebook).
is_code_minus4 = T.wife_inc == -4;
T.wife_inc(is_code_minus4) = 0;

% constructing nonwife_inc
% nonwife_inc starts as NaN (= not available) and is filled in two passes.
T.nonwife_inc = NaN(nrows(T),1);
T = movevars(T, "nonwife_inc", "After", "family_inc");

% Pass 1: family income minus the wife's income, when both are usable.
ix_1 = T.family_inc > 0 & T.wife_inc >= 0;
fprintf('# obs with family_inc > 0 & wife_inc >= 0: %d\n', sum(ix_1));

T.nonwife_inc(ix_1) = T.family_inc(ix_1) - T.wife_inc(ix_1);
fprintf('# Constructed nonwife_inc for %d observations\n', sum(ix_1));

% Pass 2: fall back to spouse_inc where pass 1 was not possible.
% Robustness check: can exclude the following observations as less reliable using "nonwife_inc_imputed" flag
ix_3 = isnan(T.nonwife_inc) & T.spouse_inc > 0; % set nonwife_inc = spouse_inc if couldn't compute above
T.nonwife_inc(ix_3) = T.spouse_inc(ix_3);
T.nonwife_inc_imputed = ix_3;
fprintf(' ... added nonwife_inc for %d observations as spouse_inc when other info was N/A\n', sum(ix_3));

fprintf('# valid obs: nonwife_inc, spouse_inc:  ');
disp(sum([~isnan(T.nonwife_inc) T.spouse_inc>=0]));

% Drop rows with neither a nonwife_inc nor a non-negative spouse_inc.
% Rows with missing nonwife_inc and spouse_inc == 0 are neither imputed
% (pass 2 requires spouse_inc > 0) nor dropped here; they are exported with
% the missing-value flag nwifeinc = -9 in the save step.
ix_drop_2 = isnan(T.nonwife_inc) & T.spouse_inc <0;
fprintf("# of cases: no valid 'nonwife_inc' or 'spouse_inc':   %4d\n", sum(ix_drop_2));
fprintf('  ... dropping these\n');
T(ix_drop_2,:) = [];

% NB: Mroz data is in 1975 dollars
% BLS deflator 1975-Jun - 2018-Jun is 4.70

%% Save the dataset for data analysis

% Non-wife income in thousands; missing values are exported as -9.
nwifeinc = T.nonwife_inc / 1000;
nwifeinc(isnan(nwifeinc)) = -9; % flag for missing info

T_data = T;
T_data.nwifeinc = nwifeinc;

% KEY_RACE_ETHNICITY_1997: 1 - Black, 2 - Hispanic, 3 - Mixed Race, 4 - Non-Black / Non-Hispanic
% (to tabulate: disp(sum(T_data.KEY_RACE_ETHNICITY_1997==(1:4))))
%KEY_RACE_ETHNICITY_1997 == 3 is "Mixed Race (Non-Hispanic)", but there are very few observations
race_eth = T_data.KEY_RACE_ETHNICITY_1997;
T_data.black    = race_eth == 1;
T_data.hispanic = race_eth == 2;

fprintf('In total, the dataset has  %d  observations (some may be excluded by further restrictions)\n', nrows(T));

% Columns written to the CSV, in this order.
out_vars = ["PUBID_1997", "KEY_SEX_1997", "age", "black", "hispanic", ...
            "educ", "exper", "hours", "nwifeinc", ...
            "spouse_empl", "spouse_educ", "spouse_age", "spouse_sex", "spouse_rel", ...
            "spouse_inc", "spouse_inc_imputed", "nonwife_inc_imputed", ...
            "kidslt6", "kidsge6"];
T_data = T_data(:, out_vars);
writetable(T_data, fname_out_csv);

