In [ ]:
Copied!
# Show which R build and which attached packages this notebook runs on.
# (The cell body was previously repeated twice, printing everything twice.)
R.version.string
sessionInfo()
R Features
R is excellent for:
- Statistical Analysis
- Data Visualization
- Machine Learning
- Bioinformatics
- Academic Research
In [ ]:
Copied!
# Example: basic data structures in R
# Build a small data frame with one row per programming language.
# The cell body used to be repeated verbatim; a single copy is enough.
languages <- data.frame(
  name = c("Python", "R", "Julia", "Ruby"),
  type = c("General", "Statistical", "Scientific", "General"),
  year_created = c(1991, 1993, 2012, 1995),
  data_science_score = c(9, 10, 8, 5),
  # Keep the text columns as character vectors on every R version
  stringsAsFactors = FALSE
)

# Display the data frame
print("Programming Languages for Data Science:")
languages
In [ ]:
Copied!
# Statistical summary of the scores
summary(languages$data_science_score)

# Age of each language in whole years, relative to the current calendar year
current_year <- as.numeric(format(Sys.Date(), "%Y"))
languages$age <- current_year - languages$year_created

# cat() writes "\n" as a real line break; print() would show the string
# quoted with the escape sequence left visible.
cat("\nLanguages with age:\n")

# Oldest language first
languages[order(languages$age, decreasing = TRUE), ]
Data Visualization with Base R
In [ ]:
Copied!
# Create visualizations
# Bar plot of the data science score of each language.
# barplot() returns the x coordinate of every bar's midpoint; keeping it lets
# the value labels follow the bars instead of relying on hand-computed
# positions that only fit four bars with default width and spacing.
bar_midpoints <- barplot(
  languages$data_science_score,
  names.arg = languages$name,
  main = "Data Science Scores by Language",
  ylab = "Score",
  col = c("#3776ab", "#276DC3", "#9558B2", "#CC342D"),
  ylim = c(0, 12) # headroom above the tallest bar for its label
)

# Add value labels slightly above the top of each bar
text(
  x = bar_midpoints,
  y = languages$data_science_score + 0.5,
  labels = languages$data_science_score
)
In [ ]:
Copied!
# Statistical analysis example: simple linear regression on simulated data.
# (The cell body was previously repeated twice, so the data was simulated,
# plotted and fitted two times.)

# Generate sample data: y depends linearly on x (slope 2) plus Gaussian noise
set.seed(42) # fixed seed so the figure is reproducible
n <- 100
x <- rnorm(n, mean = 50, sd = 10)
y <- 2 * x + rnorm(n, mean = 0, sd = 5)

# Scatter plot; semi-transparent points keep overlapping observations visible
plot(
  x, y,
  main = "Linear Regression Example",
  xlab = "X Variable",
  ylab = "Y Variable",
  pch = 19,
  col = rgb(0, 0, 1, 0.5)
)

# Fit the linear model and overlay the fitted line
model <- lm(y ~ x)
abline(model, col = "red", lwd = 2)

# Annotate the plot with R-squared, rounded to three decimals.
# The label is anchored at data coordinates (40, 120), drawn to the right.
r_squared <- round(summary(model)$r.squared, 3)
text(40, 120, paste("R² =", r_squared), pos = 4)
Advanced Statistical Analysis
In [ ]:
Copied!
# ANOVA example: compare the means of three simulated groups.
# (The cell body was previously repeated twice; one copy is kept.)

# Create sample data: 30 observations per group, same spread, different means
set.seed(123) # fixed seed so the table and plot are reproducible
group_a <- rnorm(30, mean = 100, sd = 15)
group_b <- rnorm(30, mean = 110, sd = 15)
group_c <- rnorm(30, mean = 105, sd = 15)

# Combine into a long-format data frame: one row per observation
experiment_data <- data.frame(
  value = c(group_a, group_b, group_c),
  group = factor(rep(c("A", "B", "C"), each = 30))
)

# One-way ANOVA of value by group, followed by the ANOVA table
anova_result <- aov(value ~ group, data = experiment_data)
summary(anova_result)

# Box plot of the same data, one box per group
boxplot(
  value ~ group,
  data = experiment_data,
  main = "Group Comparison",
  xlab = "Group",
  ylab = "Value",
  col = c("lightblue", "lightgreen", "lightcoral")
)
In [ ]:
Copied!
# Time series analysis example: one year of simulated daily values made of
# a linear trend, one seasonal cycle and Gaussian noise.
# (The cell body was previously repeated twice; one copy is kept.)

# Seed the generator here so the figure does not depend on which cells were
# run before this one (the other simulation cells also set their own seed).
set.seed(2023)

# Number of daily observations, named once instead of repeating the literal
n_days <- 365

# Generate time series data
dates <- seq(as.Date("2023-01-01"), by = "day", length.out = n_days)
trend <- seq(100, 150, length.out = n_days)
seasonal <- 10 * sin(2 * pi * seq_len(n_days) / n_days) # one full sine cycle
noise <- rnorm(n_days, mean = 0, sd = 5)
values <- trend + seasonal + noise

# Create time series plot of the observed values
plot(
  dates, values,
  type = "l",
  main = "Time Series Analysis",
  xlab = "Date",
  ylab = "Value",
  col = "blue"
)

# Add the underlying trend line on top
lines(dates, trend, col = "red", lwd = 2)

# Add legend; line widths match the two series drawn above
legend(
  "topleft",
  legend = c("Observed", "Trend"),
  col = c("blue", "red"),
  lty = 1,
  lwd = c(1, 2)
)
R Packages and Functions
In [ ]:
Copied!
# Custom functions

#' Summarise a numeric vector, ignoring missing values
#'
#' @param x A numeric vector; `NA` and `NaN` entries are dropped before any
#'   statistic is computed.
#' @return A named list with elements `mean`, `median`, `sd`, `min`, `max`
#'   and `n`, where `n` is the integer count of non-missing values. When no
#'   non-missing value remains, `min` and `max` are `NA` instead of the
#'   infinite values (and warnings) that `min()`/`max()` give on empty input.
calculate_stats <- function(x) {
  # Drop missing values once instead of passing na.rm to every call
  observed <- x[!is.na(x)]
  has_data <- length(observed) > 0

  list(
    mean = mean(observed),
    median = median(observed),
    sd = sd(observed),
    min = if (has_data) min(observed) else NA_real_,
    max = if (has_data) max(observed) else NA_real_,
    n = length(observed)
  )
}

# Test the function on data that contains one missing value
test_data <- c(23, 45, 67, 89, 12, 34, 56, 78, 90, NA, 43)
results <- calculate_stats(test_data)
print("Statistical Summary:")
for (name in names(results)) {
  value <- results[[name]]
  # The count is a whole number; show it as such rather than as "10.00"
  line_format <- if (is.integer(value)) "%-10s: %d\n" else "%-10s: %.2f\n"
  cat(sprintf(line_format, name, value))
}
Installing R Packages
You can install R packages directly from a notebook cell:
In [ ]:
Copied!
# Example of installing packages (commented out to avoid actual installation)
# install.packages("ggplot2")
# install.packages("dplyr")
# install.packages("tidyr")
# install.packages("caret")

# One line per suggested package, printed below in this order.
# (The cell body was previously repeated twice, printing every hint twice.)
popular_packages <- c(
  "- ggplot2: Advanced data visualization",
  "- dplyr: Data manipulation",
  "- tidyr: Data tidying",
  "- caret: Machine learning"
)

print("Use install.packages('package_name') to install R packages")
print("Popular R packages for data science:")
for (package_line in popular_packages) {
  print(package_line)
}
Summary
The R kernel (IRkernel) provides:
- Comprehensive statistical computing capabilities
- Excellent built-in visualization functions
- Extensive package ecosystem (CRAN)
- Specialized tools for various domains
- Strong academic and research community
R in Jupyter is ideal for:
- Statistical analysis and modeling
- Data visualization and exploration
- Academic research and publication
- Bioinformatics and genomics
- Time series analysis