library(tidyverse)
library(ggplot2)
library(stats)
library(dplyr)
library(tidyr)
library(data.table)
library(moments)
library(ggpubr)
library(readxl)
library(plotrix)
# Load the raw Amazon product export.
amazon_dataset <- read_excel("G:/My Drive/1_SPRING 2021/2_STATISTICS AND ANALYSIS/amazon_dataset.xlsx")
str(amazon_dataset)
# Replace the crawler's raw headers with clean, consistent column names.
amazon_dataset_old_names <- colnames(amazon_dataset)
amazon_dataset_names <- c("Uniq_Id","Crawl_Timestamp","Product_Id","Product_Barcode","Product_Company_Type_Source","Retailer","Product_Category_1","Product_Category_2","Product_Category_3","Product_Category_4","Product_Category_5", "Product_Category_6","Product_Category_7", "Product_Brand","Product_Name","Product_Price","Sku","Upc","Product_Url","Market","Product_Description","Product_Currency","Product_Available_Inventory","Product_Image_Url","Product_Model_Number","Product_Tags","Product_Contents","Product_Rating","Product_Reviews_Count","Bsr","Joining_Key")
# Plain colnames()<- instead of data.table::setnames(): setnames() modifies by
# reference (and warns on non-data.table input), which silently renamed BOTH
# amazon_dataset and its "copy" in the original code. The stray
# as.data.frame(amazon_dataset) no-op (result discarded) has been removed.
colnames(amazon_dataset) <- amazon_dataset_names
# Drop rows with no top-level category; keep the alias the original script
# created so any later references to it still resolve.
amazon_dataset <- drop_na(amazon_dataset, "Product_Category_1")
amazon_dataset_clean_title <- amazon_dataset
# Overall average product price across every remaining row.
ad_avg_price <- mean(amazon_dataset$Product_Price, na.rm = TRUE)
print(ad_avg_price)
# console output: [1] 41.30683
# Distinct top-level product categories (kept for interactive inspection).
prod_cats <- unique(amazon_dataset[c("Product_Category_1")])
view(prod_cats)
# Per-category mean price and row count in ONE grouped pass. The original
# built two parallel summaries with group_by(amazon_dataset$Product_Category_1)
# (an anti-pattern: `$`-qualified columns inside dplyr verbs bypass the data
# masking) and glued them together with cbind(), silently assuming both
# summaries came back in identical row order.
per_category <- amazon_dataset %>%
  group_by(product_category = Product_Category_1) %>%
  summarize(average_price = mean(Product_Price, na.rm = TRUE),
            product_count = n()) %>%
  arrange(product_category) %>%
  as.data.frame()
# Keep the two intermediate objects the original created, for compatibility.
mean_prod_cat_1 <- per_category[, c("product_category", "average_price")]
count_prod_cat_1 <- per_category[, c("product_category", "product_count")]
# Final table, largest categories first.
product_mean_count <- per_category %>%
  arrange(desc(product_count))
view(product_mean_count)
# Grand total of products across all 27 categories
# (column 3 of product_mean_count is product_count).
total_obs <- sum(product_mean_count$product_count[1:27])
print(total_obs)
# console output: [1] 29935
# Subtotal for the nine largest categories, printed beside the grand total
# for a side-by-side comparison.
total_obs_9 <- sum(product_mean_count$product_count[1:9])
print(total_obs)
print(total_obs_9)
# NOTE: the top 9 categories represent ~97% of the dataset (see percentage_top_9 below).
# Share of all observations covered by the top nine categories, in percent.
share_top_9 <- total_obs_9 / total_obs * 100
percentage_top_9 <- round(share_top_9, digits = 2)
print(percentage_top_9)
# console output: [1] 29935  [1] 29292
# console output: [1] 97.85
# Each top-9 category's share of the top-9 subtotal, vectorized over rows 1:9
# instead of nine hand-written scalar expressions.
percentages_prod_cat <- round(product_mean_count[1:9, 3] / total_obs_9 * 100, digits = 2)
# NOTE(review): desired_cats is not defined anywhere in this file; it is
# assumed to hold the nine category labels in the same order as
# product_mean_count - confirm before running.
totals_cat_top_9 <- cbind.data.frame(desired_cats[, 1], percentages_prod_cat, product_mean_count[1:9, 2], product_mean_count[1:9, 3])
colnames(totals_cat_top_9) <- c("Product Category", "Percentage", "Mean, Product Price", "Product Count")
print(totals_cat_top_9)
# Top-9 category names in the fixed order used throughout this script
# (index i maps to the amazon_dataset_<i> family of objects).
top_9_categories <- c(
  "Home & Kitchen", "Health & Household", "Beauty & Personal Care",
  "Tools & Home Improvement", "Toys & Games", "Baby Products",
  "Office Products", "Industrial & Scientific", "Clothing, Shoes & Jewelry"
)
# For each category build the same three globals the original created with
# 36 copy-pasted statements:
#   amazon_dataset_<i>        - filtered rows for that category
#   amazon_dataset_<i>_1      - one-column data frame of non-NA prices
#                               (column named product_price)
#   amazon_dataset_<i>_vector - plain numeric vector of non-NA prices
for (i in seq_along(top_9_categories)) {
  cat_rows <- filter(amazon_dataset, Product_Category_1 == top_9_categories[i])
  cat_prices <- as.vector(na.omit(cat_rows$Product_Price))
  assign(paste0("amazon_dataset_", i), cat_rows)
  assign(paste0("amazon_dataset_", i, "_1"), data.frame(product_price = cat_prices))
  assign(paste0("amazon_dataset_", i, "_vector"), cat_prices)
}
# mean<i> / median<i> for each of the nine category data frames,
# skipping missing prices.
for (i in 1:9) {
  cat_prices <- get(paste0("amazon_dataset_", i))$Product_Price
  assign(paste0("mean", i), mean(cat_prices, na.rm = TRUE))
  assign(paste0("median", i), median(cat_prices, na.rm = TRUE))
}
# Statistical mode: the most frequent value in v. Ties go to the value that
# appears first in unique(v), per tabulate()/which.max() semantics.
getmode <- function(v) {
  distinct_vals <- unique(v)
  freq <- tabulate(match(v, distinct_vals))
  distinct_vals[which.max(freq)]
}
# BUG FIX: the original called getmode() on ONE-COLUMN DATA FRAMES
# (amazon_dataset_<i>_1). Inside getmode(), match(df, unique(df)) compares the
# data frame as a list, yielding NA whenever duplicates exist, so
# tabulate()/which.max() produced an empty selection and mode<i> ended up
# empty or meaningless; the as.numeric(unlist(...)) step merely hid that.
# Computing the mode on the plain price vectors gives the intended scalar.
for (i in 1:9) {
  assign(paste0("mode", i),
         as.numeric(getmode(get(paste0("amazon_dataset_", i, "_vector")))))
}
# min_max_<i>: two-element vector c(min, max) of each category's prices.
# range() returns exactly c(min(x), max(x)).
for (i in 1:9) {
  assign(paste0("min_max_", i),
         range(get(paste0("amazon_dataset_", i, "_vector"))))
}
# Price histogram per category, stored as price_hist_<i> and printed with a
# category-specific title.
hist_titles <- c("Home & Kitchen", "Health & Household", "Beauty & Personal Care",
                 "Tools & Home Improvement", "Toys & Games", "Baby Products",
                 "Office Products", "Industrial & Scientific",
                 "Clothing, Shoes & Jewelry")
# Toys & Games (index 5) uses a finer binwidth than the rest, as in the
# original plots.
hist_binwidths <- c(50, 50, 50, 50, 10, 50, 50, 50, 50)
for (i in 1:9) {
  h <- ggplot(get(paste0("amazon_dataset_", i)), aes(x = Product_Price)) +
    geom_histogram(binwidth = hist_binwidths[i])
  assign(paste0("price_hist_", i), h)
  print(h + ggtitle(paste0("Product Price, ", hist_titles[i])) +
          labs(x = "Product Price", y = "Count"))
}
# Inspect suspicious rows priced at one cent or less in Home & Kitchen.
# (subset() evaluates the condition inside the data frame, so the redundant
# amazon_dataset_1$ qualifier is dropped.)
penny_lines <- subset(amazon_dataset_1, Product_Price <= 0.01)
str(penny_lines)
# Used the first dataframe to test what edit distance I would need from agrep
# in order to have the highest threshold of matches without taking on the
# entire dataframe. Determined that 0.75 was the best edit distance and the
# workflow needed. I created a column for the last category value for each
# dataset in Excel, then fed it back into R. agrep produces a list of matches
# found at each index.
amazon_dataset_1_last_cat_value <- read_excel("G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\dirty datasets\\amazon_dataset_1_last_cat_value.xlsx")
# BUG FIX: agrep()'s pattern must be a single string; passing the whole
# LAST_CAT_VALUE column used only its FIRST element (with a warning), so every
# row was fuzzy-matched against one pattern. The intended check is row-wise:
# match row i's LAST_CAT_VALUE against row i's Product_Contents.
row_matches <- mapply(
  function(pattern, text) {
    !is.na(pattern) && !is.na(text) &&
      agrepl(pattern, text, ignore.case = TRUE, max.distance = 0.75)
  },
  amazon_dataset_1_last_cat_value$LAST_CAT_VALUE,
  amazon_dataset_1_last_cat_value$Product_Contents
)
# Index form kept for compatibility with the original matching_list object.
matching_list <- which(row_matches)
amazon_dataset_1_last_cat_value$match <- ifelse(row_matches, "YES", "NO")
print(amazon_dataset_1_last_cat_value)
amazon_dataset_1_match <- amazon_dataset_1_last_cat_value
write.csv(amazon_dataset_1_match, "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\amazon_dataset_1_match_prodcon.csv", row.names = FALSE)
# Export datasets 2-9 for the Excel-side LAST_CAT_VALUE step.
dirty_dir <- "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\dirty datasets\\"
for (i in 2:9) {
  write.csv(get(paste0("amazon_dataset_", i)),
            paste0(dirty_dir, "amazon_dataset_", i, ".csv"),
            row.names = FALSE)
}
setwd("G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\dirty datasets")
getwd()
# FIX: list.files(pattern=) expects a REGEX; "*.xlsx" is a shell glob and, as
# a regex, matches any character followed by "xlsx" anywhere in the name.
# Anchor the literal extension instead.
all_files <- list.files(path = "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\dirty datasets", pattern = "\\.xlsx$")
# NOTE(review): this relies on alphabetical order mapping all_files[1..8] to
# datasets 2..9 - verify the directory contains exactly those eight files.
for (i in 2:9) {
  assign(paste0("amazon_dataset_", i, "_lcv"),
         as.data.frame(read_xlsx(all_files[i - 1])))
}
# Three fuzzy-match checks per dataset:
#   1. Last Category Value vs. Product Description
#   2. Last Category Value vs. Product Contents
#   3. Product Category 1 vs. Bsr (product category ranking)
# BUG FIX: agrep()'s pattern argument must be a single string - passing a
# whole column uses only its first element (with a warning), so each original
# match_list compared ONE pattern against the entire target column. The
# intended check is row-wise. This helper fuzzy-matches patterns[i] against
# texts[i] and returns the matching row indices, so the downstream
# match-column updates work unchanged.
row_agrep <- function(patterns, texts) {
  hits <- mapply(
    function(pattern, text) {
      !is.na(pattern) && !is.na(text) &&
        agrepl(pattern, text, ignore.case = TRUE, max.distance = 0.75)
    },
    patterns, texts
  )
  which(hits)
}
# Pattern column / target column pairs for the three checks per dataset.
agrep_pattern_cols <- c("LAST_CAT_VALUE", "LAST_CAT_VALUE", "Product_Category_1")
agrep_target_cols <- c("Product_Description", "Product_Contents", "Bsr")
for (i in 2:9) {
  lcv_df <- get(paste0("amazon_dataset_", i, "_lcv"))
  for (j in 1:3) {
    assign(paste0("match_list", i, "_", j),
           row_agrep(lcv_df[[agrep_pattern_cols[j]]], lcv_df[[agrep_target_cols[j]]]))
  }
}
# Initialize match1/match2/match3 to "NO" for every row, then flip the rows
# indexed by the corresponding match_list<i>_<j> to "YES".
for (i in 2:9) {
  lcv_name <- paste0("amazon_dataset_", i, "_lcv")
  lcv_df <- get(lcv_name)
  for (j in 1:3) {
    flag_col <- paste0("match", j)
    lcv_df[[flag_col]] <- "NO"
    lcv_df[[flag_col]][get(paste0("match_list", i, "_", j))] <- "YES"
  }
  assign(lcv_name, lcv_df)
}
# Persist the flagged datasets for manual auditing.
matchcol_dir <- "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\dirty datasets\\"
for (i in 2:9) {
  write.csv(get(paste0("amazon_dataset_", i, "_lcv")),
            paste0(matchcol_dir, "amazon_dataset_", i, "_matchcolumns.csv"),
            row.names = FALSE)
}
# After manual auditing, I import the audited datasets back in. I determined
# that lines needed attention if all three match runs returned NO, and
# deleted them if needed.
setwd("C:\\Users\\ubore\\Desktop\\clean ds")
# FIX: "*.xlsx" is a glob, not a regex - anchor the literal extension.
all_matched_cleaned_files <- as.vector(list.files(path = "C:\\Users\\ubore\\Desktop\\clean ds", pattern = "\\.xlsx$"))
view(all_matched_cleaned_files)
# (The original assigned clean_files_full_path twice; the first, dead
# assignment and the duplicate setwd() are removed.)
clean_files_full_path <- paste0("C:\\Users\\ubore\\Desktop\\clean ds\\", all_matched_cleaned_files)
# NOTE(review): relies on alphabetical order mapping files[1..9] to
# datasets 1..9 - verify the directory contents.
for (i in 1:9) {
  assign(paste0("amazon_dataset_", i, "_match_clean"),
         as.data.frame(read_xlsx(all_matched_cleaned_files[i])))
}
# Keep only rows with a recorded Product_Price in each audited dataset.
for (i in 1:9) {
  assign(paste0("amazon_dataset_", i, "_match_price"),
         drop_na(get(paste0("amazon_dataset_", i, "_match_clean")), "Product_Price"))
}
# Central-tendency stats on the audited, price-complete data.
# (No na.rm needed: drop_na() above removed every missing Product_Price.)
for (i in 1:9) {
  audited_prices <- get(paste0("amazon_dataset_", i, "_match_price"))$Product_Price
  assign(paste0("mean", i), mean(audited_prices))
  assign(paste0("median", i), median(audited_prices))
}
# Statistical mode of v (re-declared here as in the original script). The
# most frequent value wins; ties resolve to the value seen first in unique(v).
getmode <- function(v) {
  candidates <- unique(v)
  occurrences <- tabulate(match(v, candidates))
  candidates[which.max(occurrences)]
}
# Per-category mode, row count, and price range on the audited data.
# getmode() is correctly applied to the price VECTOR here.
for (i in 1:9) {
  audited_df <- get(paste0("amazon_dataset_", i, "_match_price"))
  assign(paste0("mode", i), getmode(audited_df$Product_Price))
  assign(paste0("count", i), nrow(audited_df))
  # range() returns exactly c(min(x), max(x)).
  assign(paste0("min_max_", i), range(audited_df$Product_Price))
}
# Rebuild the per-category price histograms on the audited data; a uniform
# binwidth of 10 is used for all nine plots at this stage.
for (i in 1:9) {
  assign(paste0("price_hist_", i),
         ggplot(get(paste0("amazon_dataset_", i, "_match_price")),
                aes(x = Product_Price)) +
           geom_histogram(binwidth = 10))
}
# One summary vector per category: label, mean, median, mode, min, max, count
# (min_max_<i> contributes two elements, so each vector has seven entries,
# matching the seven names assigned below).
# NOTE(review): desired_cats is not defined anywhere in this file; assumed to
# hold the nine category labels in the usual order - confirm. If desired_cats
# is a data frame/tibble, desired_cats[i,1] may not be a plain scalar, in
# which case c() would build a list rather than an atomic vector - verify.
base_values_df_1 <- as.vector(c(desired_cats[1,1], round(mean1, digits = 2), round(median1,digits = 2), round(mode1,digits = 2), round(min_max_1,digits = 2), count1))
base_values_df_2 <- as.vector(c(desired_cats[2,1], round(mean2, digits = 2), round(median2,digits = 2), round(mode2,digits = 2), round(min_max_2,digits = 2), count2))
base_values_df_3 <- as.vector(c(desired_cats[3,1], round(mean3, digits = 2), round(median3,digits = 2), round(mode3,digits = 2), round(min_max_3,digits = 2), count3))
base_values_df_4 <- as.vector(c(desired_cats[4,1], round(mean4, digits = 2), round(median4,digits = 2), round(mode4,digits = 2), round(min_max_4,digits = 2), count4))
base_values_df_5 <- as.vector(c(desired_cats[5,1], round(mean5, digits = 2), round(median5,digits = 2), round(mode5,digits = 2), round(min_max_5,digits = 2), count5))
base_values_df_6 <- as.vector(c(desired_cats[6,1], round(mean6, digits = 2), round(median6,digits = 2), round(mode6,digits = 2), round(min_max_6,digits = 2), count6))
base_values_df_7 <- as.vector(c(desired_cats[7,1], round(mean7, digits = 2), round(median7,digits = 2), round(mode7,digits = 2), round(min_max_7,digits = 2), count7))
base_values_df_8 <- as.vector(c(desired_cats[8,1], round(mean8, digits = 2), round(median8,digits = 2), round(mode8,digits = 2), round(min_max_8,digits = 2), count8))
base_values_df_9 <- as.vector(c(desired_cats[9,1], round(mean9, digits = 2), round(median9,digits = 2), round(mode9,digits = 2), round(min_max_9,digits = 2), count9))
# Label each summary entry (Min and Max cover the two min_max elements).
names(base_values_df_1) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_2) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_3) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_4) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_5) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_6) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_7) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_8) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
names(base_values_df_9) <- c("Product Category","Mean","Median","Mode","Min","Max","Product Count")
# After observing the histograms and base values: remove all products below
# 5 dollars, and remove maximum values that don't represent the category,
# judged per individual category.
# Drop products at or below $4.99 from every audited dataset; rows with
# Product_Price strictly above the threshold survive.
removal_threshold_min <- 4.99
for (i in 1:9) {
  priced_df <- get(paste0("amazon_dataset_", i, "_match_price"))
  assign(paste0("amazon_dataset_", i, "_removed_mins"),
         subset(priced_df, Product_Price > removal_threshold_min))
}
# Subset - removal of large outlier values based on base values/histograms.
# Removed category 8 (Industrial & Scientific): it is the most non-normal
# dataframe and a vertical I don't have interest in exploring further.
# Per-category upper price cutoffs chosen from the histograms/base values.
# Category 8 (Industrial & Scientific) is intentionally dropped here.
removal_threshold_max_1 <- 800
removal_threshold_max_2 <- 1000
removal_threshold_max_3 <- 280
removal_threshold_max_4 <- 625
removal_threshold_max_5 <- 50
removal_threshold_max_6 <- 275
removal_threshold_max_7 <- 125
removal_threshold_max_9 <- 100
# Keep only rows at or below each category's cutoff (note: category 8 skipped).
for (i in c(1:7, 9)) {
  trimmed_df <- get(paste0("amazon_dataset_", i, "_removed_mins"))
  price_cap <- get(paste0("removal_threshold_max_", i))
  assign(paste0("amazon_dataset_", i, "_removed_outliers"),
         subset(trimmed_df, Product_Price <= price_cap))
}
# Central tendency, product count, and price range for each trimmed dataframe,
# grouped per dataset rather than per statistic. range() returns c(min, max),
# matching the original c(min(...), max(...)) construction exactly.
# --- df1 ---
mean1 <- mean(amazon_dataset_1_removed_outliers[["Product_Price"]])
median1 <- median(amazon_dataset_1_removed_outliers[["Product_Price"]])
mode1 <- getmode(amazon_dataset_1_removed_outliers[["Product_Price"]])
count1 <- nrow(amazon_dataset_1_removed_outliers)
min_max_1 <- range(amazon_dataset_1_removed_outliers[["Product_Price"]])
# --- df2 ---
mean2 <- mean(amazon_dataset_2_removed_outliers[["Product_Price"]])
median2 <- median(amazon_dataset_2_removed_outliers[["Product_Price"]])
mode2 <- getmode(amazon_dataset_2_removed_outliers[["Product_Price"]])
count2 <- nrow(amazon_dataset_2_removed_outliers)
min_max_2 <- range(amazon_dataset_2_removed_outliers[["Product_Price"]])
# --- df3 ---
mean3 <- mean(amazon_dataset_3_removed_outliers[["Product_Price"]])
median3 <- median(amazon_dataset_3_removed_outliers[["Product_Price"]])
mode3 <- getmode(amazon_dataset_3_removed_outliers[["Product_Price"]])
count3 <- nrow(amazon_dataset_3_removed_outliers)
min_max_3 <- range(amazon_dataset_3_removed_outliers[["Product_Price"]])
# --- df4 ---
mean4 <- mean(amazon_dataset_4_removed_outliers[["Product_Price"]])
median4 <- median(amazon_dataset_4_removed_outliers[["Product_Price"]])
mode4 <- getmode(amazon_dataset_4_removed_outliers[["Product_Price"]])
count4 <- nrow(amazon_dataset_4_removed_outliers)
min_max_4 <- range(amazon_dataset_4_removed_outliers[["Product_Price"]])
# --- df5 ---
mean5 <- mean(amazon_dataset_5_removed_outliers[["Product_Price"]])
median5 <- median(amazon_dataset_5_removed_outliers[["Product_Price"]])
mode5 <- getmode(amazon_dataset_5_removed_outliers[["Product_Price"]])
count5 <- nrow(amazon_dataset_5_removed_outliers)
min_max_5 <- range(amazon_dataset_5_removed_outliers[["Product_Price"]])
# --- df6 ---
mean6 <- mean(amazon_dataset_6_removed_outliers[["Product_Price"]])
median6 <- median(amazon_dataset_6_removed_outliers[["Product_Price"]])
mode6 <- getmode(amazon_dataset_6_removed_outliers[["Product_Price"]])
count6 <- nrow(amazon_dataset_6_removed_outliers)
min_max_6 <- range(amazon_dataset_6_removed_outliers[["Product_Price"]])
# --- df7 ---
mean7 <- mean(amazon_dataset_7_removed_outliers[["Product_Price"]])
median7 <- median(amazon_dataset_7_removed_outliers[["Product_Price"]])
mode7 <- getmode(amazon_dataset_7_removed_outliers[["Product_Price"]])
count7 <- nrow(amazon_dataset_7_removed_outliers)
min_max_7 <- range(amazon_dataset_7_removed_outliers[["Product_Price"]])
# --- df9 ---
mean9 <- mean(amazon_dataset_9_removed_outliers[["Product_Price"]])
median9 <- median(amazon_dataset_9_removed_outliers[["Product_Price"]])
mode9 <- getmode(amazon_dataset_9_removed_outliers[["Product_Price"]])
count9 <- nrow(amazon_dataset_9_removed_outliers)
min_max_9 <- range(amazon_dataset_9_removed_outliers[["Product_Price"]])
# Compact "base values" summary per dataframe: category label (row 1,
# column 7 = Product_Category_1), rounded mean/median/mode, rounded
# min & max, and product count.
base_value_labels <- c("Product Category", "Mean", "Median", "Mode", "Min", "Max", "Product Count")
# Build one labeled summary. The c()/as.vector() construction mirrors the
# original line-by-line version so the result type is unchanged.
base_values_for <- function(df, avg, med, mod, rng, n) {
  out <- as.vector(c(df[1, 7], round(avg, digits = 2), round(med, digits = 2), round(mod, digits = 2), round(rng, digits = 2), n))
  names(out) <- base_value_labels
  out
}
base_values_df_1 <- base_values_for(amazon_dataset_1_removed_outliers, mean1, median1, mode1, min_max_1, count1)
base_values_df_2 <- base_values_for(amazon_dataset_2_removed_outliers, mean2, median2, mode2, min_max_2, count2)
base_values_df_3 <- base_values_for(amazon_dataset_3_removed_outliers, mean3, median3, mode3, min_max_3, count3)
base_values_df_4 <- base_values_for(amazon_dataset_4_removed_outliers, mean4, median4, mode4, min_max_4, count4)
base_values_df_5 <- base_values_for(amazon_dataset_5_removed_outliers, mean5, median5, mode5, min_max_5, count5)
base_values_df_6 <- base_values_for(amazon_dataset_6_removed_outliers, mean6, median6, mode6, min_max_6, count6)
base_values_df_7 <- base_values_for(amazon_dataset_7_removed_outliers, mean7, median7, mode7, min_max_7, count7)
base_values_df_9 <- base_values_for(amazon_dataset_9_removed_outliers, mean9, median9, mode9, min_max_9, count9)
# Histogram of Product_Price with vertical reference lines at the mean,
# median, and mode, built once and reused for every dataframe.
price_histogram <- function(df, bw, m, md, mo) {
  ggplot(df, aes(x = Product_Price)) +
    geom_histogram(binwidth = bw, color = "white") +
    geom_vline(aes(xintercept = m, color = "mean")) +
    geom_vline(aes(xintercept = md, color = "median")) +
    geom_vline(aes(xintercept = mo, color = "mode"))
}
# Binwidths match the originals: 5 for most, 10 for df5, 2 for df9.
price_hist_1 <- price_histogram(amazon_dataset_1_removed_outliers, 5, mean1, median1, mode1)
price_hist_2 <- price_histogram(amazon_dataset_2_removed_outliers, 5, mean2, median2, mode2)
price_hist_3 <- price_histogram(amazon_dataset_3_removed_outliers, 5, mean3, median3, mode3)
price_hist_4 <- price_histogram(amazon_dataset_4_removed_outliers, 5, mean4, median4, mode4)
price_hist_5 <- price_histogram(amazon_dataset_5_removed_outliers, 10, mean5, median5, mode5)
price_hist_6 <- price_histogram(amazon_dataset_6_removed_outliers, 5, mean6, median6, mode6)
price_hist_7 <- price_histogram(amazon_dataset_7_removed_outliers, 5, mean7, median7, mode7)
price_hist_9 <- price_histogram(amazon_dataset_9_removed_outliers, 2, mean9, median9, mode9)
# MOST OF THE DATAFRAMES ARE RIGHT-SKEWED, SO I NORMALIZED THE DATA USING
# LOG10 TRANSFORMATIONS.
# FINDING THE SKEWNESS COEFFICIENT:
# Skewness coefficient of each trimmed price distribution
# (moments::skewness; positive values indicate right skew).
sc_1 <- skewness(amazon_dataset_1_removed_outliers[["Product_Price"]])
sc_2 <- skewness(amazon_dataset_2_removed_outliers[["Product_Price"]])
sc_3 <- skewness(amazon_dataset_3_removed_outliers[["Product_Price"]])
sc_4 <- skewness(amazon_dataset_4_removed_outliers[["Product_Price"]])
sc_5 <- skewness(amazon_dataset_5_removed_outliers[["Product_Price"]])
sc_6 <- skewness(amazon_dataset_6_removed_outliers[["Product_Price"]])
sc_7 <- skewness(amazon_dataset_7_removed_outliers[["Product_Price"]])
sc_9 <- skewness(amazon_dataset_9_removed_outliers[["Product_Price"]])
# Named numeric vector of all coefficients (df8 intentionally absent).
skewness_coes <- c(df1 = sc_1, df2 = sc_2, df3 = sc_3, df4 = sc_4, df5 = sc_5, df6 = sc_6, df7 = sc_7, df9 = sc_9)
# Log10-transform each category's Product_Price into a one-column dataframe.
# Building the dataframe with a named column directly collapses the original
# three-step (log10 -> as.data.frame -> names<-) construction per dataset.
ads_1_log <- data.frame(Product_Price = log10(amazon_dataset_1_removed_outliers$Product_Price))
ads_2_log <- data.frame(Product_Price = log10(amazon_dataset_2_removed_outliers$Product_Price))
ads_3_log <- data.frame(Product_Price = log10(amazon_dataset_3_removed_outliers$Product_Price))
ads_4_log <- data.frame(Product_Price = log10(amazon_dataset_4_removed_outliers$Product_Price))
ads_5_log <- data.frame(Product_Price = log10(amazon_dataset_5_removed_outliers$Product_Price))
ads_6_log <- data.frame(Product_Price = log10(amazon_dataset_6_removed_outliers$Product_Price))
ads_7_log <- data.frame(Product_Price = log10(amazon_dataset_7_removed_outliers$Product_Price))
ads_9_log <- data.frame(Product_Price = log10(amazon_dataset_9_removed_outliers$Product_Price))
# Log-scale descriptive statistics and standard errors.
# FIX(review): the t-score / margin-of-error section below uses count*_log,
# mean*_log, and se_ads_* for EVERY retained category, but the original
# script only computed them for df1, leaving the others undefined. Compute
# them for categories 2-7 and 9 as well.
mean1_log <- mean(ads_1_log$Product_Price)
median1_log <- median(ads_1_log$Product_Price)
mode1_log <- getmode(ads_1_log$Product_Price)
count1_log <- as.numeric(nrow(ads_1_log))
# Density of the log prices with an overlaid normal curve (normality check).
ads_1_log_density <- ggdensity(ads_1_log, x = "Product_Price", fill = "lightgray", title = "Product Price, Home & Kitchen, Normalized") +
stat_overlay_normal_density(color = "red", linetype = "dashed")
var_ads_1 <- var(ads_1_log$Product_Price)
sd_ads_1 <- sd(ads_1_log$Product_Price)
sqrt_count_ads1 <- sqrt(count1_log)
se_ads_1 <- sd_ads_1/sqrt_count_ads1
# Mean, count, standard deviation, and standard error for the remaining
# categories (needed by the confidence-interval section).
mean2_log <- mean(ads_2_log$Product_Price)
count2_log <- as.numeric(nrow(ads_2_log))
sd_ads_2 <- sd(ads_2_log$Product_Price)
se_ads_2 <- sd_ads_2 / sqrt(count2_log)
mean3_log <- mean(ads_3_log$Product_Price)
count3_log <- as.numeric(nrow(ads_3_log))
sd_ads_3 <- sd(ads_3_log$Product_Price)
se_ads_3 <- sd_ads_3 / sqrt(count3_log)
mean4_log <- mean(ads_4_log$Product_Price)
count4_log <- as.numeric(nrow(ads_4_log))
sd_ads_4 <- sd(ads_4_log$Product_Price)
se_ads_4 <- sd_ads_4 / sqrt(count4_log)
mean5_log <- mean(ads_5_log$Product_Price)
count5_log <- as.numeric(nrow(ads_5_log))
sd_ads_5 <- sd(ads_5_log$Product_Price)
se_ads_5 <- sd_ads_5 / sqrt(count5_log)
mean6_log <- mean(ads_6_log$Product_Price)
count6_log <- as.numeric(nrow(ads_6_log))
sd_ads_6 <- sd(ads_6_log$Product_Price)
se_ads_6 <- sd_ads_6 / sqrt(count6_log)
mean7_log <- mean(ads_7_log$Product_Price)
count7_log <- as.numeric(nrow(ads_7_log))
sd_ads_7 <- sd(ads_7_log$Product_Price)
se_ads_7 <- sd_ads_7 / sqrt(count7_log)
mean9_log <- mean(ads_9_log$Product_Price)
count9_log <- as.numeric(nrow(ads_9_log))
sd_ads_9 <- sd(ads_9_log$Product_Price)
se_ads_9 <- sd_ads_9 / sqrt(count9_log)
# DETERMINE HOW STATISTICALLY CLOSE THE SAMPLE MEAN IS TO THE POPULATION MEAN,
# AND WHETHER THE DIFFERENCE BETWEEN THE MEANS OF EACH PRODUCT CATEGORY IS
# STATISTICALLY SIGNIFICANT.
# Two-sided critical t-values at 95% (alpha = 0.05) and 99% (alpha_99 = 0.01)
# confidence, with n - 1 degrees of freedom per dataframe.
alpha <- 0.05
alpha_99 <- 0.01
# Upper-tail critical value for a two-sided interval at significance level a.
crit_t <- function(a, df) qt(p = a / 2, df = df, lower.tail = FALSE)
degrees_freedom_1 <- count1_log - 1
degrees_freedom_2 <- count2_log - 1
degrees_freedom_3 <- count3_log - 1
degrees_freedom_4 <- count4_log - 1
degrees_freedom_5 <- count5_log - 1
degrees_freedom_6 <- count6_log - 1
degrees_freedom_7 <- count7_log - 1
degrees_freedom_9 <- count9_log - 1
t_score_1 <- crit_t(alpha, degrees_freedom_1)
t_score_2 <- crit_t(alpha, degrees_freedom_2)
t_score_3 <- crit_t(alpha, degrees_freedom_3)
t_score_4 <- crit_t(alpha, degrees_freedom_4)
t_score_5 <- crit_t(alpha, degrees_freedom_5)
t_score_6 <- crit_t(alpha, degrees_freedom_6)
t_score_7 <- crit_t(alpha, degrees_freedom_7)
t_score_9 <- crit_t(alpha, degrees_freedom_9)
t_score_1_99 <- crit_t(alpha_99, degrees_freedom_1)
t_score_2_99 <- crit_t(alpha_99, degrees_freedom_2)
t_score_3_99 <- crit_t(alpha_99, degrees_freedom_3)
t_score_4_99 <- crit_t(alpha_99, degrees_freedom_4)
t_score_5_99 <- crit_t(alpha_99, degrees_freedom_5)
t_score_6_99 <- crit_t(alpha_99, degrees_freedom_6)
t_score_7_99 <- crit_t(alpha_99, degrees_freedom_7)
t_score_9_99 <- crit_t(alpha_99, degrees_freedom_9)
# Margins of error (critical t * standard error) and the resulting
# confidence-interval bounds around each log-scale mean, grouped per
# dataframe with the 95% and 99% intervals side by side.
# --- df1 ---
margin_error_1 <- se_ads_1 * t_score_1
margin_error_1_99 <- se_ads_1 * t_score_1_99
lower_bound_1 <- mean1_log - margin_error_1
upper_bound_1 <- mean1_log + margin_error_1
lower_bound_1_99 <- mean1_log - margin_error_1_99
upper_bound_1_99 <- mean1_log + margin_error_1_99
# --- df2 ---
margin_error_2 <- se_ads_2 * t_score_2
margin_error_2_99 <- se_ads_2 * t_score_2_99
lower_bound_2 <- mean2_log - margin_error_2
upper_bound_2 <- mean2_log + margin_error_2
lower_bound_2_99 <- mean2_log - margin_error_2_99
upper_bound_2_99 <- mean2_log + margin_error_2_99
# --- df3 ---
margin_error_3 <- se_ads_3 * t_score_3
margin_error_3_99 <- se_ads_3 * t_score_3_99
lower_bound_3 <- mean3_log - margin_error_3
upper_bound_3 <- mean3_log + margin_error_3
lower_bound_3_99 <- mean3_log - margin_error_3_99
upper_bound_3_99 <- mean3_log + margin_error_3_99
# --- df4 ---
margin_error_4 <- se_ads_4 * t_score_4
margin_error_4_99 <- se_ads_4 * t_score_4_99
lower_bound_4 <- mean4_log - margin_error_4
upper_bound_4 <- mean4_log + margin_error_4
lower_bound_4_99 <- mean4_log - margin_error_4_99
upper_bound_4_99 <- mean4_log + margin_error_4_99
# --- df5 ---
margin_error_5 <- se_ads_5 * t_score_5
margin_error_5_99 <- se_ads_5 * t_score_5_99
lower_bound_5 <- mean5_log - margin_error_5
upper_bound_5 <- mean5_log + margin_error_5
lower_bound_5_99 <- mean5_log - margin_error_5_99
upper_bound_5_99 <- mean5_log + margin_error_5_99
# --- df6 ---
margin_error_6 <- se_ads_6 * t_score_6
margin_error_6_99 <- se_ads_6 * t_score_6_99
lower_bound_6 <- mean6_log - margin_error_6
upper_bound_6 <- mean6_log + margin_error_6
lower_bound_6_99 <- mean6_log - margin_error_6_99
upper_bound_6_99 <- mean6_log + margin_error_6_99
# --- df7 ---
margin_error_7 <- se_ads_7 * t_score_7
margin_error_7_99 <- se_ads_7 * t_score_7_99
lower_bound_7 <- mean7_log - margin_error_7
upper_bound_7 <- mean7_log + margin_error_7
lower_bound_7_99 <- mean7_log - margin_error_7_99
upper_bound_7_99 <- mean7_log + margin_error_7_99
# --- df9 ---
margin_error_9 <- se_ads_9 * t_score_9
margin_error_9_99 <- se_ads_9 * t_score_9_99
lower_bound_9 <- mean9_log - margin_error_9
upper_bound_9 <- mean9_log + margin_error_9
lower_bound_9_99 <- mean9_log - margin_error_9_99
upper_bound_9_99 <- mean9_log + margin_error_9_99
# FIX(review): CIs_ADS_95 / CIs_ADS_99 were referenced here but never built
# anywhere in the script. Assemble them from the bounds computed above so the
# column indexing used by plotCI holds: col 2 = lower, col 3 = upper,
# col 4 = mean. df8 is intentionally absent (see note above).
ci_labels <- c("df1", "df2", "df3", "df4", "df5", "df6", "df7", "df9")
ci_means <- c(mean1_log, mean2_log, mean3_log, mean4_log, mean5_log, mean6_log, mean7_log, mean9_log)
CIs_ADS_95 <- data.frame(
  dataframe = ci_labels,
  lower = c(lower_bound_1, lower_bound_2, lower_bound_3, lower_bound_4, lower_bound_5, lower_bound_6, lower_bound_7, lower_bound_9),
  upper = c(upper_bound_1, upper_bound_2, upper_bound_3, upper_bound_4, upper_bound_5, upper_bound_6, upper_bound_7, upper_bound_9),
  mean = ci_means
)
CIs_ADS_99 <- data.frame(
  dataframe = ci_labels,
  lower = c(lower_bound_1_99, lower_bound_2_99, lower_bound_3_99, lower_bound_4_99, lower_bound_5_99, lower_bound_6_99, lower_bound_7_99, lower_bound_9_99),
  upper = c(upper_bound_1_99, upper_bound_2_99, upper_bound_3_99, upper_bound_4_99, upper_bound_5_99, upper_bound_6_99, upper_bound_7_99, upper_bound_9_99),
  mean = ci_means
)
# plotrix::plotCI: x = point estimates, ui/li = upper/lower interval limits.
CI_95_PLOT <- plotCI(x=CIs_ADS_95[,4],ui=CIs_ADS_95[,3],li=CIs_ADS_95[,2], main = "CI, 95%", xlab = "Dataframes 1-7,9 (8 = 9)", ylab= "mean")
CI_99_PLOT <- plotCI(x=CIs_ADS_99[,4],ui=CIs_ADS_99[,3],li=CIs_ADS_99[,2], main = "CI, 99%", xlab = "Dataframes 1-7,9 (8 = 9)", ylab= "mean")
# THE DATAFRAMES WITH THE SMALLEST GAP BETWEEN THE LOWER AND UPPER BOUNDS FOR
# THE MEAN ARE ALSO THE DATAFRAMES WITH THE HIGHEST PRODUCT COUNTS:
# DF1, DF2, AND DF3.
# Rename the normalized price column for the three finalist dataframes,
# attach it alongside the raw columns, and unlist to plain numeric vectors.
colnames(ads_1_log) <- "Product_Price_Norm"
colnames(ads_2_log) <- "Product_Price_Norm"
colnames(ads_3_log) <- "Product_Price_Norm"
ads_1_df_norm_values <- cbind.data.frame(amazon_dataset_1_removed_outliers, ads_1_log)
ads_2_df_norm_values <- cbind.data.frame(amazon_dataset_2_removed_outliers, ads_2_log)
ads_3_df_norm_values <- cbind.data.frame(amazon_dataset_3_removed_outliers, ads_3_log)
ads_1_log_unlist <- unlist(ads_1_log)
ads_2_log_unlist <- unlist(ads_2_log)
ads_3_log_unlist <- unlist(ads_3_log)
# Pairwise F tests: are the variances of the log prices equal between groups?
var_test_1and2 <- var.test(ads_1_log_unlist, ads_2_log_unlist, alternative = "two.sided")
var_test_2and3 <- var.test(ads_2_log_unlist, ads_3_log_unlist, alternative = "two.sided")
var_test_1and3 <- var.test(ads_1_log_unlist, ads_3_log_unlist, alternative = "two.sided")
# Recorded console output — F tests to compare two variances:
#   data: ads_1_log_unlist and ads_2_log_unlist
#   F = 0.94399, num df = 10814, denom df = 4033, p-value = 0.02625
#   95% CI for ratio of variances: 0.8966503 0.9932285; estimate: 0.9439905
#   data: ads_1_log_unlist and ads_3_log_unlist
#   F = 1.2717, num df = 10814, denom df = 4498, p-value < 2.2e-16
#   95% CI for ratio of variances: 1.210346 1.335457; estimate: 1.271685
#   data: ads_2_log_unlist and ads_3_log_unlist
#   F = 1.3471, num df = 4033, denom df = 4498, p-value < 2.2e-16
#   95% CI for ratio of variances: 1.268601 1.430678; estimate: 1.347137
# THE P-VALUES FOR ALL THREE F TESTS ARE LESS THAN ALPHA (0.05), SO THE
# VARIANCES CANNOT BE TREATED AS EQUAL AND THE STUDENT (POOLED-VARIANCE)
# T TEST CANNOT BE PERFORMED — WELCH'S T TEST IS USED INSTEAD.
# H-NULL: TRUE DIFFERENCE IN MEANS = 0
# H-ALT: TRUE DIFFERENCE IN MEANS =/= 0
# ALPHA IS 0.05
# Welch two-sample t tests (the default for t.test: unequal variances)
# comparing the log-price means of the three finalist categories.
welch_means1and2 <- t.test(x = ads_1_log_unlist, y = ads_2_log_unlist)
welch_means2and3 <- t.test(x = ads_2_log_unlist, y = ads_3_log_unlist)
welch_means1and3 <- t.test(x = ads_1_log_unlist, y = ads_3_log_unlist)
# Report only the p-values.
print(welch_means1and2$p.value)
print(welch_means2and3$p.value)
print(welch_means1and3$p.value)
# Recorded console output:
#   [1] 9.23992e-05
#   [1] 7.512562e-55
#   [1] 3.53086e-54
# ALL P-VALUES ARE WELL BELOW ALPHA, SO THE DIFFERENCES BETWEEN THE CATEGORY
# MEANS ARE STATISTICALLY SIGNIFICANT (the original note misread these values).
# SIGNIFICANCE ALONE DOES NOT SAY WHICH CATEGORY'S PRICE LEVEL IS MORE
# PROFITABLE THAN THE OTHERS.
# Rating distribution for df1, then restrict each finalist dataframe to
# products that have at least one review.
product_rating_hist_1 <- ggplot(ads_1_df_norm_values, aes(x = Product_Rating)) +
  geom_histogram(binwidth = 1, color = "white")
product_rating_1 <- ads_1_df_norm_values %>%
  group_by(product_rating = Product_Rating) %>%
  tally()
# Keep rows whose review count is non-zero (x != 0 matches the original
# !(x == 0) exactly, including its NA behavior).
ads_1_df_reviewed <- ads_1_df_norm_values[ads_1_df_norm_values$Product_Reviews_Count != 0, ]
ads_2_df_reviewed <- ads_2_df_norm_values[ads_2_df_norm_values$Product_Reviews_Count != 0, ]
ads_3_df_reviewed <- ads_3_df_norm_values[ads_3_df_norm_values$Product_Reviews_Count != 0, ]
# Re-plot the rating distributions on the reviewed-only subsets.
product_rating_hist_1_rvd <- ggplot(ads_1_df_reviewed, aes(x = Product_Rating)) +
  geom_histogram(binwidth = 0.25, color = "white")
product_rating_hist_2_rvd <- ggplot(ads_2_df_reviewed, aes(x = Product_Rating)) +
  geom_histogram(binwidth = 0.25, color = "white")
product_rating_hist_3_rvd <- ggplot(ads_3_df_reviewed, aes(x = Product_Rating)) +
  geom_histogram(binwidth = 0.25, color = "white")
# Log10-normalize the ratings and attach them as extra columns.
rating_normal_1 <- log10(ads_1_df_reviewed$Product_Rating)
rating_normal_2 <- log10(ads_2_df_reviewed$Product_Rating)
rating_normal_3 <- log10(ads_3_df_reviewed$Product_Rating)
ads_1_df_reviewed <- cbind.data.frame(ads_1_df_reviewed, rating_normal_1)
ads_2_df_reviewed <- cbind.data.frame(ads_2_df_reviewed, rating_normal_2)
ads_3_df_reviewed <- cbind.data.frame(ads_3_df_reviewed, rating_normal_3)
# Density of the log ratings against an overlaid normal curve, then the
# price-vs-rating correlations (raw and normalized) per finalist category.
ads_1_rating_log_density <- ggdensity(rating_normal_1, fill = "lightgray", title = "Product Rating, Home & Kitchen") +
  stat_overlay_normal_density(color = "red", linetype = "dashed")
ads_2_rating_log_density <- ggdensity(rating_normal_2, fill = "lightgray", title = "Product Rating, Health & Household") +
  stat_overlay_normal_density(color = "red", linetype = "dashed")
ads_3_rating_log_density <- ggdensity(rating_normal_3, fill = "lightgray", title = "Product Rating, Beauty And Personal Care") +
  stat_overlay_normal_density(color = "red", linetype = "dashed")
# Pearson correlations on the raw columns...
cor_ads_1 <- with(ads_1_df_reviewed, cor(Product_Price, Product_Rating))
cor_ads_2 <- with(ads_2_df_reviewed, cor(Product_Price, Product_Rating))
cor_ads_3 <- with(ads_3_df_reviewed, cor(Product_Price, Product_Rating))
# ...and on the log-normalized columns.
cor_ads_1_norm <- with(ads_1_df_reviewed, cor(Product_Price_Norm, rating_normal_1))
cor_ads_2_norm <- with(ads_2_df_reviewed, cor(Product_Price_Norm, rating_normal_2))
cor_ads_3_norm <- with(ads_3_df_reviewed, cor(Product_Price_Norm, rating_normal_3))
print(cor_ads_1)
print(cor_ads_1_norm)
print(cor_ads_2)
print(cor_ads_2_norm)
print(cor_ads_3)
print(cor_ads_3_norm)
# Recorded console output — correlations (regular, normalized):
#   COR ADS 1: [1] -0.03021033   [1] -0.01678979
#   COR ADS 2: [1] -0.0418366    [1] -0.03605427
#   COR ADS 3: [1] -0.02456391   [1] 0.006036672
# WROTE CSVS FOR DF 1, 2, & 3 WITHOUT OUTLIERS TO DELIMIT THE BSR VALUES IN EXCEL.
# Export the three finalist dataframes so the Bsr column can be delimited by
# hand in Excel, then read the hand-cleaned .xlsx files back in.
write.csv(amazon_dataset_1_removed_outliers, "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\Final Project\\ADS_123_CSV\\ads_ros_1.csv", row.names = FALSE)
write.csv(amazon_dataset_2_removed_outliers, "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\Final Project\\ADS_123_CSV\\ads_ros_2.csv", row.names = FALSE)
write.csv(amazon_dataset_3_removed_outliers, "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\Final Project\\ADS_123_CSV\\ads_ros_3.csv", row.names = FALSE)
# NOTE(review): setwd() inside a script is fragile; kept only because later
# interactive work may rely on this working directory.
setwd("G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\Final Project\\ADS_123_CSV")
# FIX: list.files(pattern =) takes a regular expression, not a glob — the
# original "*.xlsx" is an invalid/accidental pattern. Anchor on the extension.
bsr_cleaned_files <- as.vector(list.files(path = "G:\\My Drive\\1_SPRING 2021\\2_STATISTICS AND ANALYSIS\\Final Project\\ADS_123_CSV", pattern = "\\.xlsx$"))
view(bsr_cleaned_files)
amazon_dataset_1_delim_bsr <- as.data.frame(lapply(bsr_cleaned_files[1], read_xlsx))
amazon_dataset_2_delim_bsr <- as.data.frame(lapply(bsr_cleaned_files[2], read_xlsx))
amazon_dataset_3_delim_bsr <- as.data.frame(lapply(bsr_cleaned_files[3], read_xlsx))
# Drop the spill-over columns produced by Excel's text-to-columns step
# (stray empty arguments in the original select() calls removed).
amazon_dataset_1_delim_bsr <- select(amazon_dataset_1_delim_bsr, -37:-47)
amazon_dataset_2_delim_bsr <- select(amazon_dataset_2_delim_bsr, -37:-42)
amazon_dataset_3_delim_bsr <- select(amazon_dataset_3_delim_bsr, -37:-38)
# Rename column 32 of df2's delimited frame to "BSR_1_CAT" so it matches the
# column naming used by df1 and df3.
names_adsbrs2 <- names(amazon_dataset_2_delim_bsr)
names_adsbrs2[32] <- "BSR_1_CAT"
names(amazon_dataset_2_delim_bsr) <- names_adsbrs2
# Build one view per BSR category by dropping the other categories' columns,
# then keep only rows where that BSR category was populated.
# FIX(review): the original select() calls carried a stray empty argument
# (`select(df,, ...)`); removed. Sequential selects are kept sequential
# because the negative indices refer to the already-reduced frame.
amazon_dataset_1_delim_bsr_1st <- select(amazon_dataset_1_delim_bsr, -31:-36)
amazon_dataset_1_delim_bsr_2nd <- amazon_dataset_1_delim_bsr %>%
  select(-29:-30) %>%
  select(-31:-34)
amazon_dataset_1_delim_bsr_3rd <- amazon_dataset_1_delim_bsr %>%
  select(-35:-36) %>%
  select(-29:-32)
amazon_dataset_2_delim_bsr_1st <- select(amazon_dataset_2_delim_bsr, -33:-36)
amazon_dataset_2_delim_bsr_2nd <- amazon_dataset_2_delim_bsr %>%
  select(-35:-36) %>%
  select(-31:-32)
amazon_dataset_3_delim_bsr_1st <- select(amazon_dataset_3_delim_bsr, -33:-36)
amazon_dataset_3_delim_bsr_2nd <- amazon_dataset_3_delim_bsr %>%
  select(-35:-36) %>%
  select(-31:-32)
# Rows with no value in the view's BSR category are discarded.
amazon_dataset_1_delim_bsr_1st <- drop_na(amazon_dataset_1_delim_bsr_1st, "BSR_1_CAT")
amazon_dataset_1_delim_bsr_2nd <- drop_na(amazon_dataset_1_delim_bsr_2nd, "BSR_1.2_CAT")
amazon_dataset_1_delim_bsr_3rd <- drop_na(amazon_dataset_1_delim_bsr_3rd, "BSR_2_CAT")
amazon_dataset_2_delim_bsr_1st <- drop_na(amazon_dataset_2_delim_bsr_1st, "BSR_1_CAT")
amazon_dataset_2_delim_bsr_2nd <- drop_na(amazon_dataset_2_delim_bsr_2nd, "BSR_2_CAT")
amazon_dataset_3_delim_bsr_1st <- drop_na(amazon_dataset_3_delim_bsr_1st, "BSR_1_CAT")
amazon_dataset_3_delim_bsr_2nd <- drop_na(amazon_dataset_3_delim_bsr_2nd, "BSR_2_CAT")
# Rank distribution and price-vs-rank correlation for df1 (Home & Kitchen).
# FIX(review): the plot/cor lines here originally used
# amazon_dataset_2_delim_bsr_1st, yet the conclusion below refers to DF1 and
# the identical df2 analysis is repeated verbatim right after (and the two
# recorded correlations differ: 0.147 vs 0.101) — so this first pass was
# meant to run on df1. The results are stored under df1-style names so the
# df2 pass below is not clobbered.
product_rank_hist <- ggplot(amazon_dataset_1_delim_bsr_1st, aes(x = BSR_1_VALUE)) +
  geom_histogram(binwidth = 100000, color = "white")
print(product_rank_hist)
product_rank_dot_1_1 <- plot(Product_Price ~ BSR_1_VALUE, data = amazon_dataset_1_delim_bsr_1st)
coe_1_1 <- cor(amazon_dataset_1_delim_bsr_1st$Product_Price, amazon_dataset_1_delim_bsr_1st$BSR_1_VALUE)
print(coe_1_1)
# Recorded console output: [1] 0.1473879
# NO/NEGLIGIBLE RELATIONSHIP BETWEEN PRODUCT PRICE AND PRODUCT RANK IN DF1.
# Same rank analysis for df2 (Health & Household): rank distribution, price
# vs rank scatter, and their Pearson correlation.
product_rank_hist_2_1 <- ggplot(amazon_dataset_2_delim_bsr_1st, aes(x = BSR_1_VALUE)) +
  geom_histogram(binwidth = 100000, color = "white")
print(product_rank_hist_2_1)
product_rank_dot_2_1 <- plot(Product_Price ~ BSR_1_VALUE, data = amazon_dataset_2_delim_bsr_1st)
coe_2_1 <- with(amazon_dataset_2_delim_bsr_1st, cor(Product_Price, BSR_1_VALUE))
print(coe_2_1)
# Recorded console output: [1] 0.1016072
# NO/NEGLIGIBLE RELATIONSHIP BETWEEN PRODUCT PRICE AND PRODUCT RANK IN DF2.
# Rating vs rank for df1, restricted to products with at least one review
# (drop NA review counts first, then keep strictly positive counts).
amazon_dataset_1_delim_bsr_rating <- drop_na(amazon_dataset_1_delim_bsr_1st, "Product_Reviews_Count")
amazon_dataset_1_delim_bsr_rating <-
  amazon_dataset_1_delim_bsr_rating[which(amazon_dataset_1_delim_bsr_rating$Product_Reviews_Count > 0), ]
plot(Product_Rating ~ BSR_1_VALUE, data = amazon_dataset_1_delim_bsr_rating)
coe_rating_rank <- with(amazon_dataset_1_delim_bsr_rating, cor(Product_Rating, BSR_1_VALUE))
print(coe_rating_rank)
# Recorded console output: [1] -0.1931606
# THE RELATIONSHIP BETWEEN PRODUCT RATING AND PRODUCT RANK IS WEAK, AND OUR
# DATASET MAY NOT BE LARGE ENOUGH TO DETECT MORE.
# ACCORDING TO VARIOUS SOURCES, OVER 20 FACTORS GO INTO PRODUCT RANKING,
# INCLUDING RATINGS, SALES, ANSWERED QUESTIONS, ETC.