### Code for running cross validation and k-fold cross-validation

### import the data
datum=read.csv(file.choose())
head(datum)

###### Run standard analysis of model - don't do this if you plan to do cross validation; shown for informational purposes only!
results=lm(Size~Age+Sex+MotherSize+FatherSize+Species,data=datum)
summary(results)
#notice that Species and FatherSize are not statistically significant, so take them out
results=lm(Size~Age+Sex+MotherSize,data=datum)
summary(results)
#This is the final model we will work with

###### Run standard cross validation
### First partition data into a training dataset and a testing dataset
x=seq(1,length(datum$Size),1)   #generate a vector of row indices, from 1 to the number of samples
#x
training=sample(x,size=length(datum$Size)*0.75,replace=FALSE)
#generate a random list of row indices for the samples that will be used in the training dataset;
#indices not drawn will end up in the testing dataset
#In this case, 75% of the data is sampled for the training dataset
datumTrain=datum[training,]   #This will be the training dataset
head(datumTrain)
datumTest=datum[-training,]   #This will be the testing dataset
head(datumTest)

### Now run the analysis on the training dataset
results=lm(Size~Age+Sex+MotherSize,data=datumTrain)   #This is the chosen final model
#some sort of model-building procedure like stepwise regression could be part of this step
summary(results)
#results=lm(Size~Age+Sex+MotherSize+FatherSize+Species,data=datumTrain)
#summary(results)   #Full model, for informational purposes only

### Now test the predictive ability of the model
#Here we use the betas from the model fit to the training dataset
#and make y-predictions using those betas and the x-values from the TESTING dataset
predictions=predict(results,datumTest)   #Be sure to use x-values from the testing dataset here
predictions
#Now we compare the y-predictions to the actual y-values in the testing dataset
validation=lm(predictions~datumTest$Size)
summary(validation)
#R^2 and residual standard error (closely related to RMSE) are good measures of predictive performance
plot(predictions~datumTest$Size)

###### Run k-fold cross validation
### Uses the 'caret' package
library(caret)

### run the function 'train' in the caret package
help(train)
fitControl=trainControl(method="repeatedcv",repeats=1)
#'method' in this case requests (repeated) k-fold validation;
#the default is 10 folds; if you want a different number, use 'number=x', where x is the number of folds you want
#'repeats' is the number of times the whole k-fold procedure is repeated with new random fold assignments
CV=train(Size~Age+Sex+MotherSize,data=datum,method="lm",trControl=fitControl)
#'method' here is the model-fitting function that you would normally use
#train will work with over 200 different methods!

### Check out the results
CV$results
#RMSE is the average root-mean-squared error between predicted y and actual y from the testing dataset over the 10 folds
#Rsquared is the average R^2 between predicted y and actual y from the testing dataset over the 10 folds
#MAE is the mean absolute error between predicted y and actual y from the testing dataset over the 10 folds
#RMSESD, RsquaredSD, and MAESD are the standard deviations of the above over the 10 folds
CV$resample   #shows results for each of the 10 folds
CV$finalModel   #the final model, refit to the full dataset ('lm' has no tuning parameters, so no parameter selection happens here)

### Note that RMSE and Rsquared may not be provided for other 'methods', as they may not be appropriate
#(or even possible).
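
###### Optional: compute holdout error directly (a suggested addition, not in the original script)
#A minimal sketch: the same kinds of error summaries caret reports can be computed by hand
#from the holdout predictions above. Assumes 'predictions' and 'datumTest' exist from the
#standard cross validation section; the object names below are illustrative.
RMSE_holdout=sqrt(mean((predictions-datumTest$Size)^2))   #root-mean-squared prediction error
MAE_holdout=mean(abs(predictions-datumTest$Size))         #mean absolute prediction error
RMSE_holdout
MAE_holdout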
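
###### Optional: change the number of folds and repeats (illustrative values, not from the original script)
#A sketch of the 'number' and 'repeats' arguments to trainControl mentioned above;
#e.g., 5 folds repeated 10 times gives 5 x 10 = 50 resamples to average over.
fitControl5x10=trainControl(method="repeatedcv",number=5,repeats=10)
CV5x10=train(Size~Age+Sex+MotherSize,data=datum,method="lm",trControl=fitControl5x10)
CV5x10$results   #RMSE, Rsquared, and MAE now averaged over all 50 resamples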