# R code used in Netflix analyses - Oct 25, 2007
# by Russ Lenth
#
# The functions are grouped in broad categories

#####################################################################
# Functions used to read movie files and slice-dice them into       #
# customer files                                                    #
#####################################################################

# OLD version of fcn to read a movie file as provided by Netflix
# (and compressed using gzip)
#
# read.movie = function (movieno) {
#     fname = sprintf("/space/yoyo/data/Netflix/training_set/mv_%07d.txt.gz",
#         movieno)
#     con = gzfile(fname, "rb")
#     lst = scan( con, skip=1, sep=",",
#         what = list(cust=0, rating=0, date="") )
#     close(con)
#     lst$date = as.Date(lst$date)
#     lst
# }

# Slave-node fcn used to read a Netflix-provided file and save it as
# an .RData file, to create the initial files for the parSD algorithm.
# The call to read.movie here is to the OLD version of read.movie.
# A modified version of this was used later on to make new mv_....RData
# files, saving considerable time reading movie files.
`makeRData` <- function(movieno) {
    attach(read.movie(movieno))
    fname = sprintf("cu_-%07d.RData", movieno)
    movie = 0*rating + movieno
    save(list=c("cust","movie","rating","date"), file=fname)
    detach()
    invisible(NULL)
}

# This fcn takes a filename pattern, determines which movies
# are included, and calls makeRData for each of those movies.
`makeSeveral` <- function (pattern) {
    files = dir(pattern = pattern)
    mno = as.integer(substr(files, 4, 10))
    sapply(mno, makeRData)
    invisible(NULL)
}

# Slave-node fcn for implementing one step of the slice-dice algorithm.
# This combines the files that match the given pattern, and splits
# the data up according to one digit of the customer ID.
# We figure out which digit based on lengths of strings in "fnparts"
`decimate` <- function(pattern) {
    files = dir(pattern = pattern)
    fnparts = unlist(strsplit(pattern, "_|-"))   ### "cu", "CC...", "MM...."
    d = 1000000
    ndig = nchar(fnparts[2])
    if (ndig > 0) for (i in 1:ndig) d = d / 10
    n = length(files)
    if (n == 0) return()
    C = M = R = D = NULL
    for (i in 1:n) {
        load(files[i])
        C = c(C, cust)
        M = c(M, movie)
        R = c(R, rating)
        D = c(D, date)
    }
    custcat = C %/% d
    ucust = unique(custcat)
    for (uc in ucust) {
        filename = paste("cu_", fnparts[2], uc%%10, "-", fnparts[3],
            ".RData", sep="")
        w = which(custcat == uc)
        cust = C[w]
        movie = M[w]
        rating = R[w]
        date = D[w]
        save(list=c("cust","movie","rating","date"), file=filename)
    }
    sapply(files, file.remove)   ## No longer need these files
    invisible(NULL)
}

# Master-node function to set up the files needed by the parSD
# algorithm. At that time, the needed R fcns were in a file named
# Rfcns.R. Those functions are now included in this document.
`parNFSetup` <- function(cl) {
    wd = getwd()
    clusterCall(cl, setwd, wd)
    clusterCall(cl, source, "/space/yoyo/data/Netflix/Rfcns.R")
    files = dir(pattern = "mv_")
    pats = unique(substr(files, 1, 8))
    cat(paste("Farming out the job for", length(pats), "patterns...\n"))
    parLapply(cl, pats, makeSeveral)
}

# Master-node function for doing one cycle of the parSD algorithm.
# It uses a directory listing of the current customer files and farms
# the work out, applying decimate to small sets of files.
`parSD` <- function(cl) {
    files = dir(pattern = "cu_")
    fnparts = matrix(unlist(strsplit(files, "-")), nrow=2)
    sfx = unique(fnparts[2,])
    if (length(sfx) == 1) {
        sapply(fnparts[1,], function(pfx)
            file.rename(paste(pfx,sfx,sep="-"), paste(pfx,"RData",sep=".")))
        cat("No more slicing/dicing is necessary. Files have been renamed\n")
        return(invisible())
    }
    pats = unique(substr(files, 1, 10))   # chops off last digit
    parLapply(cl, pats, decimate)
    cat(paste("We processed", length(files), "files in",
        length(pats), "patterns.\n"))
}

# I used this to list a few of the files after each stage of parSD
`peek` <- function(pattern = "cu_") {
    files = dir(pattern = pattern)
    n = length(files)
    cat(paste("We have", n, "files in all...\n"))
    if (n <= 20) files else c(files[1:10], "...", files[(n-9):n])
}
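# Example of a complete slice-dice run from the master node (a sketch,
# not the original session; it assumes 'cl' is a snow cluster, e.g. from
# snow's makeCluster(), and that the working directory holds the
# Netflix training files):
#
#     parNFSetup(cl)      # mv_*.txt.gz --> one cu_-MMMMMMM.RData per movie
#     parSD(cl); peek()   # each pass moves one customer-ID digit into the
#     parSD(cl); peek()   #   file names and drops one movie digit
#     ...                 # repeat until parSD reports that no more
#                         #   slicing/dicing is necessary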
# NEW version of function to read a movie file saved in R format.
# Note it loads cust, movie, rating, and date directly into the
# caller's frame.
`read.movie` <- function (movieno) {
    fname = sprintf("%s/mv_%07d.RData", NFpath, movieno)
    load(fname, envir=parent.frame())
}

#####################################################################
# Functions used for summarizing data                               #
#####################################################################

# Slave-node function for computing a customer summary
`cu.summ` <- function(file) {
    load(paste(NFpath,file,sep="/"))
    tapply(rating, cust, function(r) c(length(r), mean(r), sd(r)))
}

# Slave-node fcn for summarizing a movie.
# (The NEW read.movie loads rating into this fcn's frame, so we use
# it directly rather than as an element of a returned list.)
`mv.summ` <- function(movieno) {
    read.movie(movieno)
    c(length(rating), mean(rating), sd(rating))
}

# Slave-node fcn for computing a date trend for each movie
`date.trend` <- function(movieno, factor=20000) {
    read.movie(movieno)
    ridge = factor*length(rating)
    d.dev = as.integer(date) - mean(as.integer(date))
    365.25 * sum(d.dev*rating) / (ridge + sum(d.dev*d.dev))
}

# Slave-node fcn for determining the mean date for each movie.
# Running this for all movies produced the data vector mean.date
# needed by est.mv.effs and est.cu.effs
`get.mean.date` <- function(movieno) {
    read.movie(movieno)
    mean(as.integer(date))
}

#####################################################################
# Functions used for model-fitting                                  #
#####################################################################

# These are the objects I need to export to the nodes before
# running any of the parallel code. That is, I begin each session
# with this call:
#     clusterExport(cl, export.list)
# Some objects are not included in this file...
#     NFpath     = "/space/yoyo/data/Netflix/training_set" -- data path
#     read.movie -- listed in this file
#     mean.date  -- created by calling get.mean.date for each movie
#     all.cust   -- vector of all 480189 customer IDs
#     cu.pos     -- vector of length 2649429; maps customer IDs to
#                   their indexes in all.cust
`export.list` <- c("NFpath", "read.movie", "mean.date",
    "all.cust", "cu.pos")

# Slave-node fcn for estimating customer effects after adjusting
# for movie effects
`est.cu.effs` <- function (filename, lambda=0) {
    load(paste(NFpath,filename,sep="/"))
    deff = as.integer(date) - sapply(movie, function(m) mean.date[m])
    deff = deff * sapply(movie, function(m) mv.eff[2,m])
    ydev = rating - 3.6 - deff - sapply(movie, function(m) mv.eff[1,m])
    tapply(ydev, cust, function(e) sum(e) / (lambda + length(e)))
}

# Slave-node fcn for estimating movie effects after adjusting
# for customer effects
`est.mv.effs` <- function (movieno, lambda0=0, lambda1=0) {
    read.movie(movieno)
    xdev = as.integer(date) - mean.date[movieno]
    ydev = rating - 3.6 - sapply(cust, function(c) cu.eff[cu.pos[c]])
    avg = sum(ydev) / (lambda0 + length(ydev))
    slope = sum(xdev*ydev) / (lambda1 + sum(xdev*xdev))
    c(avg, slope)
}
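# The two estimation fcns above carry out the two halves of a
# backfitting sweep for the additive model
#     rating ~ 3.6 + mv.eff[1,m] + mv.eff[2,m]*(date - mean.date[m])
#                  + cu.eff[c]
# where each effect is a ridge-shrunken mean of residuals. A toy
# illustration (made-up data, not from the Netflix files) of the
# shrinkage used in est.cu.effs: with lambda = 0 we get the plain mean
# residual, while lambda > 0 pulls customers with few ratings toward 0.
#
#     ydev = c(.5, -.5, 1, 1, 1);  cust = c(1, 1, 2, 2, 2)
#     tapply(ydev, cust, function(e) sum(e) / (0 + length(e)))  # 0, 1.0
#     tapply(ydev, cust, function(e) sum(e) / (2 + length(e)))  # 0, 0.6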
# Master-node fcn to update the elements of cu.eff based on current
# values of mv.eff. lambda is the ridge parameter; call with lambda=0
# to get OLS estimates
`update.cu` <- function(cl, lambda=50) {
    clusterExport(cl, "mv.eff")
    ce = unlist(parLapply(cl, custfiles, est.cu.effs, lambda))
    chg = c(max = max(abs(ce - cu.eff)),
        RMS = sqrt(mean((ce - cu.eff)^2)))
    cu.eff <<- ce
    chg
}

# Master-node fcn to update the elements of mv.eff based on current
# values of cu.eff. lambda0 and lambda1 are ridge parameters;
# call with lambda0 = lambda1 = 0 to get OLS estimates
`update.mv` <- function(cl, lambda0=300, lambda1=1e6) {
    clusterExport(cl, "cu.eff")
    me = parSapply(cl, 1:17770, est.mv.effs, lambda0, lambda1)
    chg = c(max.eff = max(abs(me[1,]-mv.eff[1,])),
        RMS.eff = sqrt(mean((me[1,]-mv.eff[1,])^2)),
        max.slope = max(abs(me[2,]-mv.eff[2,])),
        RMS.slope = sqrt(mean((me[2,]-mv.eff[2,])^2)) )
    mv.eff <<- me
    chg
}
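# Example fitting session on the master node (a sketch, not the original
# session; it assumes 'cl' is a snow cluster, that the objects named in
# export.list exist, and that 'custfiles' -- the vector of customer-file
# names used by update.cu -- has been set up, e.g. something like
# custfiles = dir(NFpath, pattern="cu_")):
#
#     clusterExport(cl, export.list)
#     cu.eff = rep(0, length(all.cust))       # zero starting values
#     mv.eff = matrix(0, nrow=2, ncol=17770)
#     for (i in 1:10) {                       # alternate the two updates,
#         print(update.mv(cl))                #   watching the max and RMS
#         print(update.cu(cl))                #   changes shrink
#     }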