library(dplyr)
library(data.table)
library(moments)

# Split the comma-separated genre string of every movie into one column per
# position (first genre, second genre, ...). Movies with fewer genres than the
# widest row get NA in the trailing columns.
genres <- as.data.frame(movies$genres, stringsAsFactors = FALSE)
genres2 <- as.data.frame(
  tstrsplit(genres[, 1], '[,]', type.convert = TRUE),
  stringsAsFactors = FALSE
)
# Name the columns 1..k, where k is however many genre positions the split
# produced, instead of assuming exactly seven.
colnames(genres2) <- seq_len(ncol(genres2))

# Keep only the ratings whose timestamp is not earlier than the earliest
# timestamp found in `answers` (2016 according to the original note).
rating_trim <- rating[rating[["timestamp"]] >= min(answers[["timestamp"]]), ]

# Ratings from the movie's point of view: number of ratings and mean rating
# per movieId, plus the natural log of the rating count ("logpop").
rating_mov <- as.data.frame(
  summarise(
    group_by(rating_trim, movieId),
    num_rating = n(),
    avg_rating = mean(rating)
  )
)
rating_mov[["logpop"]] <- log(rating_mov[["num_rating"]])

# Ratings from the user's point of view: number of ratings and mean rating
# per userId.
rating_user <- as.data.frame(
  summarise(
    group_by(rating_trim, userId),
    count_rating = n(),
    user_avg_rating = mean(rating)
  )
)

# Attach each user's rating count and mean rating to every one of their
# ratings.
binary_rating_trim <- merge(rating_trim, rating_user, by = "userId")
# Binary "positive rating" signal: +1 when the rating is above 3, else -1.
binary_rating_trim[["positive_rating"]] <- ifelse(
  binary_rating_trim[["rating"]] > 3, 1, -1
)
# Binary "above average" signal: +1 when the rating is above that user's own
# mean rating, else -1.
binary_rating_trim[["above_average"]] <- ifelse(
  binary_rating_trim[["rating"]] > binary_rating_trim[["user_avg_rating"]], 1, -1
)

# Build a one-hot genre indicator matrix: one row per row of `genres2`, one
# column per known genre.
genre_list <- c('Action','Adventure','Animation','Children','Comedy','Crime','Documentary','Drama','Fantasy','Film-Noir','Horror','IMAX','Musical','Mystery','Romance','Sci-Fi','Thriller','War','Western')

# Size the matrix from the data rather than from hard-coded dimensions, so a
# different number of movies or genres neither errors nor leaves padding rows.
genre_matrix <- matrix(0, nrow(genres2), length(genre_list)) # all zeros

colnames(genre_matrix) <- genre_list

# For each genre position, look up which genre column every movie's value
# belongs to and set those cells to 1. Values that are NA or not in
# `genre_list` have no match and are skipped.
for (genre_pos in seq_len(ncol(genres2))) {
  genmat_col <- match(genres2[[genre_pos]], genre_list)
  has_genre <- !is.na(genmat_col)
  genre_matrix[cbind(which(has_genre), genmat_col[has_genre])] <- 1
}

# Convert into a data frame and store the numeric 0/1 flags as integers.
genre_matrix2 <- as.data.frame(genre_matrix, stringsAsFactors = FALSE)
genre_matrix2[] <- lapply(genre_matrix2, as.integer)

# Build the per-rating genre data frame.
## Keep the identifying columns and the two binary rating signals.
genre_rating <- binary_rating_trim[c("userId","movieId","positive_rating","above_average")]
## Join with the genre indicators by movie id.
## Row i of genre_matrix2 was derived from row i of `movies`, and its row names
## are just 1..n. Joining movieId against those row names would attach genres
## by row position instead of by movie id, so the join key is taken from
## movies$movieId.
## NOTE(review): assumes `movies` has a `movieId` column in the same row order
## as `movies$genres` -- confirm against where `movies` is loaded.
movie_ids <- movies[["movieId"]]
if (!is.null(movie_ids) && length(movie_ids) == nrow(genre_matrix2)) {
  genre_by_movie <- genre_matrix2
  genre_by_movie[["movieId"]] <- movie_ids
  genre_rating <- merge(genre_rating, genre_by_movie, by = "movieId")
} else {
  # No usable id column: fall back to joining on the row position, as before.
  warning(
    "movies$movieId is missing or does not align with the genre matrix; ",
    "joining genres by row position instead of by movie id",
    call. = FALSE
  )
  genre_rating <- merge(genre_rating, genre_matrix2, by.x="movieId", by.y="row.names")
}
## Subsets by rating signal: positive_rating (> 3) and above_average
## (> the user's mean rating).
genre_rating_positive <- genre_rating[genre_rating$positive_rating > 0,]
genre_rating_above_avg <- genre_rating[genre_rating$above_average > 0,]
## movieId plays no part in the per-user aggregation below.
## Grouping by userId and summing every genre column gives each user's
## per-genre score.

## Per-user genre summary weighted by positive_rating (+1 / -1); the
## above_average variant is computed separately.
genre_rating_summary <- summarise(
  group_by(genre_rating, userId),
  rated_movie_count = n(),
  sum_action = sum(Action * positive_rating),
  sum_adventure = sum(Adventure * positive_rating),
  sum_animation = sum(Animation * positive_rating),
  sum_children = sum(Children * positive_rating),
  sum_comedy = sum(Comedy * positive_rating),
  sum_crime = sum(Crime * positive_rating),
  sum_documentary = sum(Documentary * positive_rating),
  sum_drama = sum(Drama * positive_rating),
  sum_fantasy = sum(Fantasy * positive_rating),
  sum_film_noir = sum(`Film-Noir` * positive_rating),
  sum_horror = sum(Horror * positive_rating),
  sum_imax = sum(IMAX * positive_rating),
  sum_musical = sum(Musical * positive_rating),
  sum_mystery = sum(Mystery * positive_rating),
  sum_romance = sum(Romance * positive_rating),
  sum_scifi = sum(`Sci-Fi` * positive_rating),
  sum_thriller = sum(Thriller * positive_rating),
  sum_war = sum(War * positive_rating),
  sum_western = sum(Western * positive_rating)
)

# Per-user genre summary weighted by above_average (+1 / -1).
genre_rating_summary2 <- summarise(
  group_by(genre_rating, userId),
  rated_movie_count = n(),
  sum_action = sum(Action * above_average),
  sum_adventure = sum(Adventure * above_average),
  sum_animation = sum(Animation * above_average),
  sum_children = sum(Children * above_average),
  sum_comedy = sum(Comedy * above_average),
  sum_crime = sum(Crime * above_average),
  sum_documentary = sum(Documentary * above_average),
  sum_drama = sum(Drama * above_average),
  sum_fantasy = sum(Fantasy * above_average),
  sum_film_noir = sum(`Film-Noir` * above_average),
  sum_horror = sum(Horror * above_average),
  sum_imax = sum(IMAX * above_average),
  sum_musical = sum(Musical * above_average),
  sum_mystery = sum(Mystery * above_average),
  sum_romance = sum(Romance * above_average),
  sum_scifi = sum(`Sci-Fi` * above_average),
  sum_thriller = sum(Thriller * above_average),
  sum_war = sum(War * above_average),
  sum_western = sum(Western * above_average)
)

# Row-wise mean, standard deviation and kurtosis across the 19 genre sums
# (columns 3 to 21); the new columns are appended after them, so the 3:21
# range keeps pointing at the genre sums.
genre_rating_summary2[["genre_avg"]] <- rowMeans(genre_rating_summary2[, 3:21])
genre_rating_summary2[["genre_std"]] <- apply(genre_rating_summary2[, 3:21], MARGIN = 1, FUN = sd)
genre_rating_summary2[["genre_kurtosis"]] <- apply(genre_rating_summary2[, 3:21], MARGIN = 1, FUN = kurtosis)

# Per-user genre summary counting every rated movie, regardless of the rating
# signal.
genre_rating_summary3 <- summarise(
  group_by(genre_rating, userId),
  rated_movie_count = n(),
  sum_action = sum(Action),
  sum_adventure = sum(Adventure),
  sum_animation = sum(Animation),
  sum_children = sum(Children),
  sum_comedy = sum(Comedy),
  sum_crime = sum(Crime),
  sum_documentary = sum(Documentary),
  sum_drama = sum(Drama),
  sum_fantasy = sum(Fantasy),
  sum_film_noir = sum(`Film-Noir`),
  sum_horror = sum(Horror),
  sum_imax = sum(IMAX),
  sum_musical = sum(Musical),
  sum_mystery = sum(Mystery),
  sum_romance = sum(Romance),
  sum_scifi = sum(`Sci-Fi`),
  sum_thriller = sum(Thriller),
  sum_war = sum(War),
  sum_western = sum(Western)
)

#genre_rating_summary3$genre_avg <- rowMeans(genre_rating_summary3[,c(3:21)])
#genre_rating_summary3$genre_std = apply(genre_rating_summary3[,c(3:21)], 1, sd)

# Once the genre rating summary is available, its values are flattened to
# 0 for anything <= 0 and 1 for anything above 0.
# The original note says the same method is used in the SIRUP paper.
binary_threshold <- function(x) {
  is_positive <- x > 0
  ifelse(is_positive, 1, 0)
}
# Threshold the 19 positive-rating genre sums (columns 3 to 21) and write the
# 0/1 values back over the matching columns of a plain data.frame copy.
genre_rating_summary_flat <- apply(genre_rating_summary[, 3:21], MARGIN = 2, FUN = binary_threshold)
genre_rating_summary_binary <- as.data.frame(genre_rating_summary)
genre_rating_summary_binary[, names(genre_rating_summary_binary) %in% colnames(genre_rating_summary_flat)] <- genre_rating_summary_flat
# Summing the 0/1 flags across each row gives the number of genres the user
# rated positively.
genre_rating_summary_binary[["total_genres_above3"]] <- rowSums(genre_rating_summary_binary[, 3:21])

# Alternative: the same thresholding applied to the above-average summary.
genre_rating_summary_flat2 <- apply(genre_rating_summary2[, 3:21], MARGIN = 2, FUN = binary_threshold)
genre_rating_summary_binary2 <- as.data.frame(genre_rating_summary2)
genre_rating_summary_binary2[, names(genre_rating_summary_binary2) %in% colnames(genre_rating_summary_flat2)] <- genre_rating_summary_flat2
# Summing the 0/1 flags across each row gives the number of genres the user
# rated above their own average.
genre_rating_summary_binary2[["total_genres_above_avg"]] <- rowSums(genre_rating_summary_binary2[, 3:21])
# Only the user id and the genre count are kept from this variant.
genre_rating_summary_binary2 <- genre_rating_summary_binary2[, c("userId", "total_genres_above_avg")]

# Alternative: the same thresholding applied to the summary of all ratings.
genre_rating_summary_flat3 <- apply(genre_rating_summary3[, 3:21], MARGIN = 2, FUN = binary_threshold)
genre_rating_summary_binary3 <- as.data.frame(genre_rating_summary3)
genre_rating_summary_binary3[, names(genre_rating_summary_binary3) %in% colnames(genre_rating_summary_flat3)] <- genre_rating_summary_flat3
# Summing the 0/1 flags across each row gives the number of genres the user
# has rated at all.
genre_rating_summary_binary3[["total_genres"]] <- rowSums(genre_rating_summary_binary3[, 3:21])

# (disabled) count the ratings per movie and compute logpop
#rating_3 <- rating %>% group_by(movieId) %>% summarise(num_rating = n())
#rating_3 <- as.data.frame(rating_3)
#rating_3$logpop <- log(rating_3$num_rating)

# Attach the per-movie rating statistics (num_rating, avg_rating, logpop) to
# every answer.
answers_2 <- merge(answers, rating_mov, by = "movieId", sort = TRUE)

# Join the per-user genre summaries to get the user's genre counts.
answers_2 <- merge(answers_2, genre_rating_summary_binary, by = "userId", sort = TRUE)
answers_2 <- merge(answers_2, genre_rating_summary_binary2, by = "userId", sort = TRUE)
# Only userId and total_genres are taken from the third summary. It shares
# rated_movie_count and every sum_* column name with the first one, and merging
# both in full makes merge() rename the duplicates with .x/.y suffixes, which
# silently dropped rated_movie_count from the selection below.
answers_2 <- merge(
  answers_2,
  genre_rating_summary_binary3[, c("userId", "total_genres")],
  by = "userId", sort = TRUE
)
# Keep only the columns that are actually used.
answers_3_cols <- c(
  "userId", "movieId", "rating", "predictedRating",
  "s1", "s2", "s3", "s4", "s5", "s6", "s7", "s8",
  "s_ser_rel", "s_ser_find", "s_ser_imp", "s_ser_rec",
  "m_ser_rel", "m_ser_find", "m_ser_imp", "m_ser_rec",
  "prob", "rated_movie_count",
  "total_genres_above3", "total_genres", "total_genres_above_avg", "logpop"
)
answers_3 <- answers_2[, names(answers_2) %in% answers_3_cols]
# Raw values for each serendipity metric, following the metric split in the
# "investigating serendipity" paper mentioned by the original note.
answers_3[["s_nov"]] <- answers_3[["s1"]] > 3
answers_3[["m_nov"]] <- answers_3[["s2"]] > 3
# Unlike the other flags, this one is TRUE for low answers (s3 <= 3).
answers_3[["unexp_rel"]] <- answers_3[["s3"]] <= 3
answers_3[["unexp_find"]] <- answers_3[["s4"]] > 3
answers_3[["unexp_imp"]] <- answers_3[["s5"]] > 3
answers_3[["unexp_rec"]] <- answers_3[["s6"]] > 3
answers_3[["nov"]] <- 1 - answers_3[["logpop"]]
answers_3[["satisfied"]] <- answers_3[["s7"]] > 3
answers_3[["broadened"]] <- answers_3[["s8"]] > 3
# A parameter from answers that might be usable is predictedRating ->
# serendipity predictor (Kotkov).

# Convert the answer columns s1..s8 to factors, in place.
answers_3[paste0("s", 1:8)] <- lapply(answers_3[paste0("s", 1:8)], factor)

# Drop every row that still contains a missing value.
answers_3 <- na.omit(answers_3)
