### Code for running cross validation and k-fold cross-validation

### import the data
datum=read.csv(file.choose())
head(datum)

###### Run standard analysis of model - don't do this if you plan to do cross validation; shown for informational purposes only!
results=lm(Size~Age+Sex+MotherSize+FatherSize+Species,data=datum)
summary(results)
#notice that Species and FatherSize are not statistically significant, so take them out
results=lm(Size~Age+Sex+MotherSize,data=datum)
summary(results)
#This is the final model we will work with

###### Run standard cross validation
### First partition data into a training dataset and a testing dataset
x=seq(1,length(datum$Size),1)   #generate a vector of row indices, from 1 to the number of samples
#x
training=sample(x,size=length(datum$Size)*0.75,replace=FALSE)
#generate a random list of row indices for the samples that will be used in the training dataset;
#indices not drawn will end up in the testing dataset
#In this case, 75% of the data is sampled for the training dataset
datumTrain=datum[training,]   #This will be the training dataset
head(datumTrain)
datumTest=datum[-training,]   #This will be the testing dataset
head(datumTest)

### Now run the analysis on the training dataset
results=lm(Size~Age+Sex+MotherSize,data=datumTrain)   #This is the chosen final model
#some sort of model-building procedure like stepwise regression could be part of this step
summary(results)
#results=lm(Size~Age+Sex+MotherSize+FatherSize+Species,data=datumTrain)
#summary(results)   #Full model, for informational purposes only

### Now test the predictive ability of the model
#Here we use the betas from the model fit to the training dataset
#and make y-predictions using those betas and the x-values from the TESTING dataset
predictions=predict(results,datumTest)   #Be sure to use x-values from the testing dataset here
predictions
#Now we compare the y-predictions to the actual y-values in the testing dataset
validation=lm(predictions~datumTest$Size)
summary(validation)
#R^2 and residual standard error (closely related to RMSE) are good measures of predictive performance
plot(predictions~datumTest$Size)

###### Run k-fold cross validation
### Uses the 'caret' package
library(caret)

### run the function 'train' in the caret package
help(train)
fitControl=trainControl(method="repeatedcv",repeats=1)
#'method' in this case requests (repeated) k-fold validation;
#the default is 10 folds; if you want a different number, use 'number=x', where x is the number of folds you want
#'repeats' is the number of times the whole k-fold procedure is repeated with new random fold assignments
CV=train(Size~Age+Sex+MotherSize,data=datum,method="lm",trControl=fitControl)
#'method' here is the model-fitting function that you would normally use
#train will work with over 200 different methods!

### Check out the results
CV$results
#RMSE is the average root-mean-squared error between predicted y and actual y from the testing dataset over the 10 folds
#Rsquared is the average R^2 between predicted y and actual y from the testing dataset over the 10 folds
#MAE is the mean absolute error between predicted y and actual y from the testing dataset over the 10 folds
#RMSESD, RsquaredSD, and MAESD are the standard deviations of the above over the 10 folds
CV$resample   #shows results for each of the 10 folds
CV$finalModel   #the final model, refit to the full dataset ('lm' has no tuning parameters, so no parameter selection happens here)

### Note that RMSE and Rsquared may not be provided for other 'methods', as they may not be appropriate
#(or even possible).
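
###### Optional: compute holdout error directly (a suggested addition, not in the original script)
#A minimal sketch: the same kinds of error summaries caret reports can be computed by hand
#from the holdout predictions above. Assumes 'predictions' and 'datumTest' exist from the
#standard cross validation section; the object names below are illustrative.
RMSE_holdout=sqrt(mean((predictions-datumTest$Size)^2))   #root-mean-squared prediction error
MAE_holdout=mean(abs(predictions-datumTest$Size))         #mean absolute prediction error
RMSE_holdout
MAE_holdout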
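
###### Optional: change the number of folds and repeats (illustrative values, not from the original script)
#A sketch of the 'number' and 'repeats' arguments to trainControl mentioned above;
#e.g., 5 folds repeated 10 times gives 5 x 10 = 50 resamples to average over.
fitControl5x10=trainControl(method="repeatedcv",number=5,repeats=10)
CV5x10=train(Size~Age+Sex+MotherSize,data=datum,method="lm",trControl=fitControl5x10)
CV5x10$results   #RMSE, Rsquared, and MAE now averaged over all 50 resamples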