Carbon Emissions

Kaggle now hosts some interesting data sets; one of them is Carbon Emissions. It contains carbon dioxide emissions from electricity generation, broken down by fuel type such as coal and natural gas.

I have started with the basic exploratory analysis after cleaning up the data. Here is the box plot of various fuel types spread across the years:

facet-category-plot

I will keep adding more to it.

Here is the code:

---
title: "Exploratory Analysis"
author: "Pattern Project"
date: "09 November 2016"
output:
  html_document:
    fig_width: 10
    fig_height: 10
    theme: spacelab
    highlight: kate
---

## `````````````````````````````````````````````
#### Read Me ####
## `````````````````````````````````````````````
## Version Log:
## v 0 0 1 : Read Loop. Basic Graph
## v 0 0 2 : Why 13 observations for a year, and not 12 (they had included a yearly aggregate; also the description col shows the type of emission, which was excluded earlier)
## v 0 0 3 : Basic Graph now for 12 values / year
## v 0 0 4 : Graph 2 with lines connecting the year
## v 0 0 5 : Graph 3, histogram, box plot
## v 0 0 6 : Graph 4, now a loop of box plots for various categories
## v 0 0 7 : Box Plot of all categories
## v 0 0 8 :
## v 0 0 9 :

## TODO:
# http://www.stat.pitt.edu/stoffer/tsa4/R_toot.htm


## `````````````````````````````````````````````

Load Libraries
```{r, message = F, warning = F}
## `````````````````````````````````````````````
#### Load Libraries ####
## `````````````````````````````````````````````
library(dplyr) # for df maninpulation
library(readr) # for file I/O
library(purrr) # for map functions
library(stringr) # for string functions
library(tidyr) # for melt functions
library(lubridate) # for date/times.
#library(anytime) # for date/times.
library(ggplot2) # for plots
library(forcats) # for factors
library(viridis) # for color palette
library(ggthemes) # clean theme for ggplot2
library(scales) # for plot label formatting
library(gridExtra) # for arranging individual ggplot objects
library(DT) # for data.frame output
library(knitr)
library(grid)
library(RColorBrewer)
## `````````````````````````````````````````````
```


Helper Functions
```{r}

## `````````````````````````````````````````````
#### Helper Functions ####
## `````````````````````````````````````````````

# src:
# http://stackoverflow.com/questions/8425409/file-path-issues-in-r-using-windows-hex-digits-in-character-string-error

# Simply copy the path to your clipboard (ctrl + c) and then run the function as pathPrep()

# Convert the backslashes in a path to forward slashes and place the result
# on the clipboard.
#
# path: "clipboard" (the default) reads the path from the clipboard via
#       readClipboard(); any other value prompts for the path on the console.
# Returns the converted path.
pathPrep <- function(path = "clipboard") {
  if (path == "clipboard") {
    raw_path <- readClipboard()
  } else {
    cat("Please enter the path:\n\n")
    raw_path <- readline()
  }

  # swap every "\" for "/" so the path can be pasted into R code as-is
  converted <- chartr("\\", "/", raw_path)
  writeClipboard(converted)
  converted
}

# SRC:
# http://minimaxir.com/2015/02/ggplot-tutorial/

# Grey-toned ggplot2 theme built on top of theme_bw().
#
# Takes no arguments and returns a theme object that can be added to a plot
# with `+`. All shades come from the 9-step RColorBrewer "Greys" palette.
fte_theme <- function() {

  # Generate the colors for the chart procedurally with RColorBrewer
  greys <- brewer.pal(n = 9, name = "Greys")
  color.background <- greys[2]
  color.grid.major <- greys[3]
  color.axis.text <- greys[6]
  color.axis.title <- greys[7]
  color.title <- greys[9]

  theme_bw(base_size = 9) +
    theme(
      # Set the entire chart region to a light gray color
      panel.background = element_rect(fill = color.background, color = color.background),
      plot.background = element_rect(fill = color.background, color = color.background),
      panel.border = element_rect(color = color.background),

      # Format the grid
      # (linewidth replaces the `size` argument deprecated in ggplot2 3.4.0)
      panel.grid.major = element_line(color = color.grid.major, linewidth = 0.25),
      panel.grid.minor = element_blank(),
      axis.ticks = element_blank(),

      # Format the legend; add theme(legend.position = "none") on a plot to hide it
      legend.background = element_rect(fill = color.background),
      legend.text = element_text(size = 7, color = color.axis.title),

      # Set title and axis labels, and format these and tick marks
      plot.title = element_text(color = color.title, size = 10, vjust = 1.25),
      axis.text.x = element_text(size = 7, color = color.axis.text),
      axis.text.y = element_text(size = 7, color = color.axis.text),
      axis.title.x = element_text(size = 8, color = color.axis.title, vjust = 0),
      axis.title.y = element_text(size = 8, color = color.axis.title, vjust = 1.25),

      # Plot margins
      plot.margin = unit(c(0.35, 0.2, 0.3, 0.35), "cm")
    )
}

# Multi Plot Function
# SRC:
# http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/

# Multiple plot function
#
# Draws several plot objects on one page using a grid layout.
#
# Arguments:
# - ...:      plot objects to draw
# - plotlist: a list of plot objects, appended after the ones given in ...
# - file:     not used by this function; kept so existing calls still work
# - cols:     number of columns in the layout
# - layout:   a matrix specifying the layout. If present, 'cols' is ignored.
#
# If the layout is something like matrix(c(1,2,3,3), nrow=2, byrow=TRUE),
# then plot 1 will go in the upper left, 2 will go in the upper right, and
# 3 will go all the way across the bottom.
#
# Called for its side effect of drawing. With no plots at all it draws
# nothing and returns NULL invisibly.
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  # Make a list from the ... arguments and plotlist
  plots <- c(list(...), plotlist)
  numPlots <- length(plots)

  # Nothing to draw. Without this guard a 1:0 loop would try plots[[1]].
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  # If layout is NULL, then use 'cols' to determine layout
  if (is.null(layout)) {
    # rows needed to fit all plots into 'cols' columns
    rows <- ceiling(numPlots / cols)
    layout <- matrix(seq_len(cols * rows), ncol = cols, nrow = rows)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    # Set up the page
    grid::grid.newpage()
    grid::pushViewport(
      grid::viewport(layout = grid::grid.layout(nrow(layout), ncol(layout)))
    )

    # Make each plot, in the correct location
    for (i in seq_len(numPlots)) {
      # Get the row/col matrix positions of the regions that contain this subplot
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))

      print(
        plots[[i]],
        vp = grid::viewport(
          layout.pos.row = matchidx$row,
          layout.pos.col = matchidx$col
        )
      )
    }
  }
}


## `````````````````````````````````````````````

```

Read Input Data
```{r}
# Read the data
# flag_local == 1 reads the CSV from the local project folder; any other
# value reads it from the relative "../input" path.
flag_local <- 1

# the raw file contains a lot of values with "Not Available"
ch.na.strings <- c("", "NA", "Not Available")

# for local use
if (flag_local == 1) {
  ch.root <- "D:/2. Bianca/1. Perso/13. Kaggle/6. d - Carbon Emissions"

  # A setwd() inside a chunk only lasts for that chunk, so the project root
  # is handed to knitr instead and file paths are built from it explicitly.
  # https://github.com/yihui/knitr/issues/277
  opts_knit$set(root.dir = ch.root)

  # folder holding the data files
  ch.zip.path <- file.path(ch.root, "2. Data")

  # all csv files in that folder
  ch.csv.files <-
    list.files(path = ch.zip.path, pattern = "\\.csv$", full.names = TRUE)

  # read.csv() reads one file; fail with a clear message otherwise
  if (length(ch.csv.files) != 1) {
    stop(
      "Expected exactly one CSV file in '", ch.zip.path, "', found ",
      length(ch.csv.files), ".",
      call. = FALSE
    )
  }

  rm(ch.root, ch.zip.path)
} else {
  # at top level the `else` has to sit on the same line as the closing brace
  # http://stackoverflow.com/questions/14865435/unexpected-else-in-else-error
  ch.csv.files <- "../input/MER_T12_06.csv"
}

df.master <- read.csv(
  ch.csv.files,
  header = TRUE,
  stringsAsFactors = FALSE,
  na.strings = ch.na.strings
)

# converting to tibble (as_data_frame() is deprecated in favour of as_tibble())
t.master <- as_tibble(df.master)
rm(df.master)

# convert the names to lowercase
names(t.master) <- tolower(names(t.master))

# have a look
t.master %>% glimpse()

# cols to keep; all_of() errors if one of them is missing from the file
ch.keep <- c("yyyymm", "value", "description")

t.1 <-
  t.master %>%
  select(all_of(ch.keep))

t.1

# clean up
#rm(t.master)
rm(ch.csv.files, ch.keep, ch.na.strings)
```

Input Conversion Issue
```{r}
# Debugging the conversion-to-numeric issue.
# The rows marked "problematic" held the text "Not Available" instead of NA;
# this was fixed in read.csv() by adding na.strings.

# rows whose value is missing
filter(t.1, is.na(value))

# convert the values slice by slice to narrow down where conversion fails
as.numeric(t.1$value[1000:2000])
as.numeric(t.1$value[2000:3000])
as.numeric(t.1$value[3000:4000]) # problematic
as.numeric(t.1$value[4000:5000])

as.numeric(t.1$value[3000:3500]) # problematic
as.numeric(t.1$value[3390:3500]) # problematic
```


Basic Manipulation
```{r}
# Split yyyymm into year and month, make value numeric, and drop the
# month-13 rows (the yearly aggregate).
t.1 <- t.1 %>%
  # remove any na values
  na.omit() %>%
  # separate out year and month
  mutate(
    dummy = as.character(yyyymm),
    year = as.factor(substr(dummy, 1, 4)),
    month_num = as.integer(substr(dummy, 5, 6)),
    value = as.numeric(value)
  ) %>%
  # removing the 13th aggregate value
  filter(month_num %in% 1:12) %>%
  # Map month numbers to names with the built-in constant. Going through the
  # number (rather than overwriting factor levels) keeps the mapping correct
  # even if some month never occurs in the data.
  mutate(month = factor(month.abb[month_num], levels = month.abb)) %>%
  select(year, month, value, description)

#t.1 %>% filter(is.na(value))

t.1 %>% glimpse()

t.1
```
Digging into the Description col
```{r}
# distinct description values among the 1973 rows
t.1 %>%
  filter(year %in% 1973) %>%
  distinct(description)
```
Making the Description Col Shorter
```{r}
# long description text -> short label; descriptions not listed here are
# left unchanged by recode()
ch.short <- c(
  "Coal Electric Power Sector CO2 Emissions" = "coal",
  "Natural Gas Electric Power Sector CO2 Emissions" = "natural gas",
  "Distillate Fuel, Including Kerosene-Type Jet Fuel, Oil Electric Power Sector CO2 Emissions" = "distillate fuel",
  "Petroleum Coke Electric Power Sector CO2 Emissions" = "petroleum coke",
  "Residual Fuel Oil Electric Power Sector CO2 Emissions" = "residual fuel",
  "Petroleum Electric Power Sector CO2 Emissions" = "petroleum electric power",
  "Total Energy Electric Power Sector CO2 Emissions" = "total energy"
)

# replace the description column in place and store it as a factor
t.1 <- t.1 %>%
  mutate(description = as.factor(recode(description, !!!ch.short)))

rm(ch.short)

t.1
```



Adding a date column
```{r}
# adding a date column (first day of each month)
# The date is built from numbers rather than by parsing "%b" text, because
# parsing month abbreviations depends on the session locale.
t.1 <-
  t.1 %>%
  mutate(
    date = make_date(
      year = as.integer(as.character(year)),
      month = match(as.character(month), month.abb),
      day = 1L
    )
  )

```

Basic Line Plot Showing the Trend Across the Years from 1973 to 1980
```{r}
# rows for 1973-1980 (the month-13 aggregate was already removed above and
# months are names by now, so no further month filter is needed)
t.2 <-
  t.1 %>%
  filter(year %in% 1973:1980)

# one line per year across the months, one panel per description
g.1 <- ggplot(t.2, aes(x = month, y = value, group = year, colour = year)) +
  geom_point() +
  geom_line() +
  facet_wrap(~ description, ncol = 2) +
  fte_theme() +
  labs(
    title = "Carbon Emissions from Electricity Generation, Across the Years",
    x = "Months",
    y = "Value (in Million Metric Tons of Carbon Dioxide)"
  )
g.1
```

Histogram of Values from 1973 to 1980
```{r}
# overlaid, semi-transparent histograms of value, coloured by year,
# one panel per description
g.2 <- t.2 %>%
  filter(year %in% 1973:1980) %>%
  ggplot(aes(x = value)) +
  geom_histogram(
    aes(fill = year),
    #fill = "white",
    binwidth = 0.5,
    alpha = 0.5,
    position = "identity"
  ) +
  facet_wrap(~ description, ncol = 2) +
  fte_theme()
g.2
```

Box Plot of "Coal"
```{r}
# yearly box plots of value for the "coal" rows only
g.3 <- t.1 %>%
  filter(description %in% "coal") %>%
  ggplot(aes(x = year, y = value, fill = year)) +
  geom_boxplot() +
  fte_theme() +
  theme(
    # rotate the year labels so they do not overlap
    axis.text.x = element_text(angle = 90, hjust = 1),
    legend.position = "none"
  )
g.3
```
Looping over different categories / descriptions
```{r}
# One yearly box plot per description.
# split() names each piece after its description, and imap() passes that
# name along, so every plot gets a single-string title (the original passed
# the whole description column as the title).
plots <- t.1 %>%
  split(.$description) %>%
  imap(function(df, category) {
    ggplot(df) +
      geom_boxplot(aes(x = year, y = value, fill = year)) +
      fte_theme() +
      theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
      theme(legend.position = "none") +
      labs(title = category)
  }) #%>%
  #walk(print)

# fancy printing
multiplot(plotlist = plots, cols = 2)

```


# Fin

Thanks for reading.

 

 

Advertisements