# R code used in Netflix analyses - Oct 25, 2007
# by Russ Lenth
#
# The functions are grouped in broad categories

#####################################################################
# Functions used to read movie files and slice-dice them into       #
# customer files                                                    #
#####################################################################

# OLD version of fcn to read a movie file as provided by Netflix
# (and compressed using gzip)
#
# read.movie = function (movieno) {
#     fname = sprintf("/space/yoyo/data/Netflix/training_set/mv_%07d.txt.gz",
#         movieno)
#     con = gzfile(fname, "rb")
#     lst = scan( con, skip=1, sep=",",
#         what = list(cust=0, rating=0, date="") )
#     close(con)
#     lst$date = as.Date(lst$date)
#     lst
# }

# Slave-node fcn used to read a Netflix-provided file and save it as
# an .RData file, to create the initial files for the parSD algorithm.
# The call to read.movie here is to the OLD version of read.movie.
# A modified version of this was used later on to make new mv_....RData
# files, saving considerable time reading movie files.
`makeRData` <- function(movieno) {
    attach(read.movie(movieno))
    fname = sprintf("cu_-%07d.RData", movieno)
    movie = 0*rating + movieno
    save(list=c("cust","movie","rating","date"), file=fname)
    detach()
    invisible(NULL)
}

# This fcn takes a filename pattern, determines which movies
# are included, and calls makeRData for each of those movies.
`makeSeveral` <- function (pattern) {
    files = dir(pattern = pattern)
    mno = as.integer(substr(files, 4, 10))
    sapply(mno, makeRData)
    invisible(NULL)
}

# Slave-node fcn for implementing one step of the slice-dice algorithm.
# This combines the files that match the given pattern, and splits
# the data up according to one digit of the customer ID.
# We figure out which digit based on lengths of strings in "fnparts"
`decimate` <- function(pattern) {
    files = dir(pattern = pattern)
    fnparts = unlist(strsplit(pattern, "_|-"))   ### "cu", "CC...", "MM...."
    d = 1000000
    ndig = nchar(fnparts[2])
    if (ndig > 0) for (i in 1:ndig) d = d / 10
    n = length(files)
    if (n == 0) return()
    C = M = R = D = NULL
    for (i in 1:n) {
        load(files[i])
        C = c(C, cust)
        M = c(M, movie)
        R = c(R, rating)
        D = c(D, date)
    }
    custcat = C %/% d
    ucust = unique(custcat)
    for (uc in ucust) {
        filename = paste("cu_", fnparts[2], uc%%10, "-", fnparts[3],
            ".RData", sep="")
        w = which(custcat == uc)
        cust = C[w]
        movie = M[w]
        rating = R[w]
        date = D[w]
        save(list=c("cust","movie","rating","date"), file=filename)
    }
    sapply(files, file.remove)   ## No longer need these files
    invisible(NULL)
}

# Master-node function to set up the files needed by the parSD
# algorithm. At that time, the needed R fcns were in a file named
# Rfcns.R. Those functions are now included in this document.
`parNFSetup` <- function(cl) {
    wd = getwd()
    clusterCall(cl, setwd, wd)
    clusterCall(cl, source, "/space/yoyo/data/Netflix/Rfcns.R")
    files = dir(pattern = "mv_")
    pats = unique(substr(files, 1, 8))
    cat(paste("Farming out the job for", length(pats), "patterns...\n"))
    parLapply(cl, pats, makeSeveral)
}

# Master-node function for doing one cycle of the parSD algorithm.
# It uses a directory listing of the current customer files and farms
# the work out, applying decimate to small sets of files.
`parSD` <- function(cl) {
    files = dir(pattern = "cu_")
    fnparts = matrix(unlist(strsplit(files, "-")), nrow=2)
    sfx = unique(fnparts[2,])
    if (length(sfx) == 1) {
        sapply(fnparts[1,], function(pfx)
            file.rename(paste(pfx,sfx,sep="-"), paste(pfx,"RData",sep=".")))
        cat("No more slicing/dicing is necessary. Files have been renamed\n")
        return(invisible())
    }
    pats = unique(substr(files, 1, 10))   # chops off last digit
    parLapply(cl, pats, decimate)
    cat(paste("We processed", length(files), "files in",
        length(pats), "patterns.\n"))
}

# I used this to list a few of the files after each stage of parSD
`peek` <- function(pattern = "cu_") {
    files = dir(pattern = pattern)
    n = length(files)
    cat(paste("We have", n, "files in all...\n"))
    if (n <= 20) files else c(files[1:10], "...", files[(n-9):n])
}
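# Example of a complete slice-dice run from the master node (a sketch,
# not the original session; it assumes 'cl' is a snow cluster, e.g. from
# snow's makeCluster(), and that the working directory holds the
# Netflix training files):
#
#     parNFSetup(cl)      # mv_*.txt.gz --> one cu_-MMMMMMM.RData per movie
#     parSD(cl); peek()   # each pass moves one customer-ID digit into the
#     parSD(cl); peek()   #   file names and drops one movie digit
#     ...                 # repeat until parSD reports that no more
#                         #   slicing/dicing is necessary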
# NEW version of function to read a movie file saved in R format.
# Note it loads cust, movie, rating, and date directly into the
# caller's frame.
`read.movie` <- function (movieno) {
    fname = sprintf("%s/mv_%07d.RData", NFpath, movieno)
    load(fname, envir=parent.frame())
}

#####################################################################
# Functions used for summarizing data                               #
#####################################################################

# Slave-node function for computing a customer summary
`cu.summ` <- function(file) {
    load(paste(NFpath,file,sep="/"))
    tapply(rating, cust, function(r) c(length(r), mean(r), sd(r)))
}

# Slave-node fcn for summarizing a movie.
# (The NEW read.movie loads rating into this fcn's frame, so we use
# it directly rather than as an element of a returned list.)
`mv.summ` <- function(movieno) {
    read.movie(movieno)
    c(length(rating), mean(rating), sd(rating))
}

# Slave-node fcn for computing a date trend for each movie
`date.trend` <- function(movieno, factor=20000) {
    read.movie(movieno)
    ridge = factor*length(rating)
    d.dev = as.integer(date) - mean(as.integer(date))
    365.25 * sum(d.dev*rating) / (ridge + sum(d.dev*d.dev))
}

# Slave-node fcn for determining the mean date for each movie.
# Running this for all movies produced the data vector mean.date
# needed by est.mv.effs and est.cu.effs
`get.mean.date` <- function(movieno) {
    read.movie(movieno)
    mean(as.integer(date))
}

#####################################################################
# Functions used for model-fitting                                  #
#####################################################################

# These are the objects I need to export to the nodes before
# running any of the parallel code. That is, I begin each session
# with this call:
#     clusterExport(cl, export.list)
# Some objects are not included in this file...
#     NFpath     = "/space/yoyo/data/Netflix/training_set" -- data path
#     read.movie -- listed in this file
#     mean.date  -- created by calling get.mean.date for each movie
#     all.cust   -- vector of all 480189 customer IDs
#     cu.pos     -- vector of length 2649429; maps customer IDs to
#                   their indexes in all.cust
`export.list` <- c("NFpath", "read.movie", "mean.date",
    "all.cust", "cu.pos")

# Slave-node fcn for estimating customer effects after adjusting
# for movie effects
`est.cu.effs` <- function (filename, lambda=0) {
    load(paste(NFpath,filename,sep="/"))
    deff = as.integer(date) - sapply(movie, function(m) mean.date[m])
    deff = deff * sapply(movie, function(m) mv.eff[2,m])
    ydev = rating - 3.6 - deff - sapply(movie, function(m) mv.eff[1,m])
    tapply(ydev, cust, function(e) sum(e) / (lambda + length(e)))
}

# Slave-node fcn for estimating movie effects after adjusting
# for customer effects
`est.mv.effs` <- function (movieno, lambda0=0, lambda1=0) {
    read.movie(movieno)
    xdev = as.integer(date) - mean.date[movieno]
    ydev = rating - 3.6 - sapply(cust, function(c) cu.eff[cu.pos[c]])
    avg = sum(ydev) / (lambda0 + length(ydev))
    slope = sum(xdev*ydev) / (lambda1 + sum(xdev*xdev))
    c(avg, slope)
}
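# The two estimation fcns above carry out the two halves of a
# backfitting sweep for the additive model
#     rating ~ 3.6 + mv.eff[1,m] + mv.eff[2,m]*(date - mean.date[m])
#                  + cu.eff[c]
# where each effect is a ridge-shrunken mean of residuals. A toy
# illustration (made-up data, not from the Netflix files) of the
# shrinkage used in est.cu.effs: with lambda = 0 we get the plain mean
# residual, while lambda > 0 pulls customers with few ratings toward 0.
#
#     ydev = c(.5, -.5, 1, 1, 1);  cust = c(1, 1, 2, 2, 2)
#     tapply(ydev, cust, function(e) sum(e) / (0 + length(e)))  # 0, 1.0
#     tapply(ydev, cust, function(e) sum(e) / (2 + length(e)))  # 0, 0.6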
# Master-node fcn to update the elements of cu.eff based on current
# values of mv.eff. lambda is the ridge parameter; call with lambda=0
# to get OLS estimates
`update.cu` <- function(cl, lambda=50) {
    clusterExport(cl, "mv.eff")
    ce = unlist(parLapply(cl, custfiles, est.cu.effs, lambda))
    chg = c(max = max(abs(ce - cu.eff)),
        RMS = sqrt(mean((ce - cu.eff)^2)))
    cu.eff <<- ce
    chg
}

# Master-node fcn to update the elements of mv.eff based on current
# values of cu.eff. lambda0 and lambda1 are ridge parameters;
# call with lambda0 = lambda1 = 0 to get OLS estimates
`update.mv` <- function(cl, lambda0=300, lambda1=1e6) {
    clusterExport(cl, "cu.eff")
    me = parSapply(cl, 1:17770, est.mv.effs, lambda0, lambda1)
    chg = c(max.eff = max(abs(me[1,]-mv.eff[1,])),
        RMS.eff = sqrt(mean((me[1,]-mv.eff[1,])^2)),
        max.slope = max(abs(me[2,]-mv.eff[2,])),
        RMS.slope = sqrt(mean((me[2,]-mv.eff[2,])^2)) )
    mv.eff <<- me
    chg
}
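# Example fitting session on the master node (a sketch, not the original
# session; it assumes 'cl' is a snow cluster, that the objects named in
# export.list exist, and that 'custfiles' -- the vector of customer-file
# names used by update.cu -- has been set up, e.g. something like
# custfiles = dir(NFpath, pattern="cu_")):
#
#     clusterExport(cl, export.list)
#     cu.eff = rep(0, length(all.cust))       # zero starting values
#     mv.eff = matrix(0, nrow=2, ncol=17770)
#     for (i in 1:10) {                       # alternate the two updates,
#         print(update.mv(cl))                #   watching the max and RMS
#         print(update.cu(cl))                #   changes shrink
#     }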