#data() # checking
data(Orange) # loading
Intro to R-2
Important functions
ls() # list all the variables in the environment
rm(x) # can be used to remove specific var
#rm(list = ls()) # removes everything
Built-in datasets in R
View(Orange) # opens a viewing window
head(Orange) # prints out first six rows on your console
Tree age circumference
1 1 118 30
2 1 484 58
3 1 664 87
4 1 1004 115
5 1 1231 120
6 1 1372 142
tail(Orange) # prints out last six rows on your console
Tree age circumference
30 5 484 49
31 5 664 81
32 5 1004 125
33 5 1231 142
34 5 1372 174
35 5 1582 177
str(Orange) # tells you about the structure of the data
Classes 'nfnGroupedData', 'nfGroupedData', 'groupedData' and 'data.frame': 35 obs. of 3 variables:
$ Tree : Ord.factor w/ 5 levels "3"<"1"<"5"<"2"<..: 2 2 2 2 2 2 2 4 4 4 ...
$ age : num 118 484 664 1004 1231 ...
$ circumference: num 30 58 87 115 120 142 145 33 69 111 ...
- attr(*, "formula")=Class 'formula' language circumference ~ age | Tree
.. ..- attr(*, ".Environment")=<environment: R_EmptyEnv>
- attr(*, "labels")=List of 2
..$ x: chr "Time since December 31, 1968"
..$ y: chr "Trunk circumference"
- attr(*, "units")=List of 2
..$ x: chr "(days)"
..$ y: chr "(mm)"
dim(Orange) # dimensions of the data - in terms of rows and columns
[1] 35 3
Data frames
A data frame is a rectangular collection of values, usually organized so that variables appear in the columns and observations appear in rows.
Creating a data frame
# Creating a new data frame that has 3 vectors of different types
dat.new <- data.frame(name = c("Anna", "Bob", "Chris"),
                      language = c("R", "C", "Java"),
                      year = c(4, 5, 9))
# View the dataframe
head(dat.new)
name language year
1 Anna R 4
2 Bob C 5
3 Chris Java 9
Reading and writing a dataframe to a file
# This will add additional row names
write.csv(dat.new,file = "my_data.csv")
read.csv("my_data.csv")
X name language year
1 1 Anna R 4
2 2 Bob C 5
3 3 Chris Java 9
# To avoid adding row names to the data, use row.names=FALSE
write.csv(dat.new,file = "my_data.csv", row.names = FALSE)
read.csv("my_data.csv") # reads into the console
name language year
1 Anna R 4
2 Bob C 5
3 Chris Java 9
# To save the data into a variable:
dat.new2 <- read.csv("my_data.csv")
Working with data frames
dat.new2 <- read.csv("my_data.csv")
# Note that you should have your entire path for the data under the quotes
# if your data is not saved in the working directory
# names of the data
names(dat.new2)
[1] "name" "language" "year"
# column names
colnames(dat.new2)
[1] "name" "language" "year"
# rownames
rownames(dat.new2)
[1] "1" "2" "3"
# type of data
class(dat.new2)
[1] "data.frame"
# structure of data
str(dat.new2)
'data.frame': 3 obs. of 3 variables:
$ name : chr "Anna" "Bob" "Chris"
$ language: chr "R" "C" "Java"
$ year : int 4 5 9
Accessing values in a dataframe
# accessing a vector in the dataframe
dat.new2$name
[1] "Anna" "Bob" "Chris"
# accessing a second element of a vector in the dataframe
dat.new2$name[2]
[1] "Bob"
# accessing the language in the dataframe
dat.new2$language
[1] "R" "C" "Java"
# accessing a third element of a vector in the dataframe
dat.new2$language[3]
[1] "Java"
# Removing rows
dat.new2[-2,]
name language year
1 Anna R 4
3 Chris Java 9
# Removing columns
dat.new2[,-1]
language year
1 R 4
2 C 5
3 Java 9
# Appending to a data frame
dat.new3 <- data.frame(name = c("Diego", "Evan", "Felicia", "George"),
                       language = c("Python", "Perl", "C++", "HTML"),
                       year = c(4, 5, 9, 10))
# Binding by row
rbind(dat.new2,dat.new3)
name language year
1 Anna R 4
2 Bob C 5
3 Chris Java 9
4 Diego Python 4
5 Evan Perl 5
6 Felicia C++ 9
7 George HTML 10
newdata <- rbind(dat.new2,dat.new3)
# Binding by column
# cbind(dat.new2,dat.new3) # will give you error
## why this error? Because to bind by column, the number of rows should be equal
dat.new4 <- dat.new3[-3,] # removing a row from 3rd data frame and adding to a new data frame
cbind(dat.new4,dat.new2)
name language year name language year
1 Diego Python 4 Anna R 4
2 Evan Perl 5 Bob C 5
4 George HTML 10 Chris Java 9
# Notice that R doesn't care about the content but the structure of data
Subsetting dataframes
# The notation is nameofdata[row, column]
newdata[4,2]
[1] "Python"
# accessing more than one value
newdata[1:4,] # gets you first four rows and all the columns
name language year
1 Anna R 4
2 Bob C 5
3 Chris Java 9
4 Diego Python 4
newdata[,1:2] # all the rows and first two columns
name language
1 Anna R
2 Bob C
3 Chris Java
4 Diego Python
5 Evan Perl
6 Felicia C++
7 George HTML
# Extracting specific information:
newdata[c(1,3),] # Gets you first and third row and all columns
name language year
1 Anna R 4
3 Chris Java 9
newdata[c(5,4),c(1,2)] # Guess
name language
5 Evan Perl
4 Diego Python
## Tip:
# Remember that the order of extraction matters.
## Accessing by name
newdata["name"]
name
1 Anna
2 Bob
3 Chris
4 Diego
5 Evan
6 Felicia
7 George
newdata["year"]
year
1 4
2 5
3 9
4 4
5 5
6 9
7 10
## Operator
newdata$name
[1] "Anna" "Bob" "Chris" "Diego" "Evan" "Felicia" "George"
Matrices
In R, a matrix is a collection of elements of the same data type (numeric, character, or logical) arranged into a fixed number of rows and columns. A matrix is two-dimensional, since we are only working with rows and columns.
# Creating a new empty matrix
mat.example <- matrix(0, ncol=8, nrow=4)
# viewing the matrix
View(mat.example)
# class
class(mat.example)
[1] "matrix" "array"
# rows and columns
dim(mat.example)
[1] 4 8
# accessing column number
ncol(mat.example)
[1] 8
# accessing row numbers
nrow(mat.example)
[1] 4
Creating data using rnorm function
# Creating a new matrix with data:
# Using rnorm to generate 20 random numbers
mat.dat <- rnorm(20)
# Note that the random numbers will be different every time you run this code.
# Note: check ?rnorm, ?pnorm and ?dnorm for more information on using probability distributions
mat.dat
[1] 1.29609747 1.10365481 -2.02880832 -1.29160212 -0.37954786 0.42798445
[7] -0.21279220 -0.05206884 -0.12243237 -1.01648228 -0.83820196 0.98699999
[13] -0.55725134 1.19378723 -0.25376620 1.50198228 -1.57086939 1.42074561
[19] 1.16966486 0.30449234
# populating the matrix by row
mat.example2 <- matrix(data = mat.dat, nrow = 4, ncol = 5, byrow = TRUE)
Subsetting Matrices
# Subsetting rows and columns
mat.example2[2,4]
[1] -0.1224324
Lists in R
Lists are R objects that can contain elements of different types. For example, a list can have numbers, characters, strings, vectors, a matrix, and another list inside it. A list can also have a function as one of its elements.
list_data <- list("Red","Green", c(21,34,22), TRUE, 52.53, 193.8)
list_data
[[1]]
[1] "Red"
[[2]]
[1] "Green"
[[3]]
[1] 21 34 22
[[4]]
[1] TRUE
[[5]]
[1] 52.53
[[6]]
[1] 193.8
list_data_new <- list(dat.new2, mat.example2)
list_data_new
[[1]]
name language year
1 Anna R 4
2 Bob C 5
3 Chris Java 9
[[2]]
[,1] [,2] [,3] [,4] [,5]
[1,] 1.2960975 1.1036548 -2.02880832 -1.2916021 -0.3795479
[2,] 0.4279845 -0.2127922 -0.05206884 -0.1224324 -1.0164823
[3,] -0.8382020 0.9870000 -0.55725134 1.1937872 -0.2537662
[4,] 1.5019823 -1.5708694 1.42074561 1.1696649 0.3044923
Subsetting lists
list_data[[3]]
[1] 21 34 22
list_data_new[[2]]
[,1] [,2] [,3] [,4] [,5]
[1,] 1.2960975 1.1036548 -2.02880832 -1.2916021 -0.3795479
[2,] 0.4279845 -0.2127922 -0.05206884 -0.1224324 -1.0164823
[3,] -0.8382020 0.9870000 -0.55725134 1.1937872 -0.2537662
[4,] 1.5019823 -1.5708694 1.42074561 1.1696649 0.3044923
Bonus activity - Plotting in R (using base R)
Base R can be used to create simple plots.
#--------Basic scatterplot
# Create data for plot
x <- seq(1,100, by=3)
y <- x/2
# Basic x and y plot
plot(x,y)
# Add title
plot(x,y,main="Title of plot")
# Add x and y labels
plot(x,y,main="Title of plot",xlab = "This is x axis label",
ylab="This is y label")
# Change color
plot(x,y,main="Title of plot",xlab = "This is x axis label",
ylab="This is y label",col="blue")
Histograms
# rnorm generates a vector of normally distributed random numbers.
x <- rnorm(1000, mean = 0, sd = 1)
hist(x)
hist(x, breaks = 10)
hist(x, breaks = 100)
hist(x, breaks = 100, col = "orange", main = "This is title",
xlab = "This is x axis", ylab = "This is y axis")
Questions from the workshop
Q1: How to sort dataframes?
# Use sort function to sort a vector
x <- c(1:10,8.5,7.6,2.5,1.3,2.4)
sort(x)
[1] 1.0 1.3 2.0 2.4 2.5 3.0 4.0 5.0 6.0 7.0 7.6 8.0 8.5 9.0 10.0
# For sorting dataframes using the function order
newdata[order(newdata$year, decreasing = TRUE), ]
name language year
7 George HTML 10
3 Chris Java 9
6 Felicia C++ 9
2 Bob C 5
5 Evan Perl 5
1 Anna R 4
4 Diego Python 4
# Notice that the above function is used at the row position since we are sorting by row
# Another example:
newdata2 <- data.frame(col1=c(31:40),col2=c(seq(1,30,by=3)))
newdata2
col1 col2
1 31 1
2 32 4
3 33 7
4 34 10
5 35 13
6 36 16
7 37 19
8 38 22
9 39 25
10 40 28
newdata2[order(newdata2$col1, decreasing = TRUE),]
col1 col2
10 40 28
9 39 25
8 38 22
7 37 19
6 36 16
5 35 13
4 34 10
3 33 7
2 32 4
1 31 1
Q2: How to increase the number of rows displaying on console?
# Using the head command, where n is followed by number of rows
head(newdata2) # normally yields the first 6 rows
col1 col2
1 31 1
2 32 4
3 33 7
4 34 10
5 35 13
6 36 16
head(newdata2,n=9)# would yield the first 9 rows
col1 col2
1 31 1
2 32 4
3 33 7
4 34 10
5 35 13
6 36 16
7 37 19
8 38 22
9 39 25
Q3: Can I use a graph in list?
# Answer: Not as an image but you can add a figure/graph as an object
# For example:
h1 <- hist(rnorm(1000))
h2 <- hist(rnorm(1000))
list.example <- list(h1,h2)
# The above example won't save the graph but the attributes
list.example
[[1]]
$breaks
[1] -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5
$counts
[1] 4 3 18 45 92 167 167 193 129 110 42 23 6 1
$density
[1] 0.008 0.006 0.036 0.090 0.184 0.334 0.334 0.386 0.258 0.220 0.084 0.046
[13] 0.012 0.002
$mids
[1] -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25 0.25 0.75 1.25 1.75 2.25
[13] 2.75 3.25
$xname
[1] "rnorm(1000)"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
[[2]]
$breaks
[1] -3.5 -3.0 -2.5 -2.0 -1.5 -1.0 -0.5 0.0 0.5 1.0 1.5 2.0 2.5 3.0 3.5
$counts
[1] 2 6 15 38 74 148 201 217 136 94 45 17 6 1
$density
[1] 0.004 0.012 0.030 0.076 0.148 0.296 0.402 0.434 0.272 0.188 0.090 0.034
[13] 0.012 0.002
$mids
[1] -3.25 -2.75 -2.25 -1.75 -1.25 -0.75 -0.25 0.25 0.75 1.25 1.75 2.25
[13] 2.75 3.25
$xname
[1] "rnorm(1000)"
$equidist
[1] TRUE
attr(,"class")
[1] "histogram"
# Using ggplot2 you can save the graph information as well
library(ggplot2)
Warning: package 'ggplot2' was built under R version 4.2.3
df <- data.frame(x = rnorm(10), y = rnorm(10))

fig1 <- ggplot(df, aes(x, y)) + geom_point() + ggtitle("Graph 1")
fig2 <- ggplot(df, aes(x, y)) + geom_point() + ggtitle("Graph 2")

figlist <- list(fig1,fig2)
figlist
[[1]]
[[2]]
Q4: Difference between data.frame and data.table?
The stackoverflow post provided an excellent answer of the syntax differences between data frame and data table, so I highly encourage folks to read the information on this link along with the beginner’s FAQs for data.table here