课程报告复现第一期

第一期 数据分析作业复现

1
2
3
4
5
6
7
8
# R代码完整解释

## 加载必要的库

```r
library(readxl)
library(dplyr)
library(ggplot2)

这里加载了三个R包:

  • readxl: 用于读取Excel文件
  • dplyr: 用于数据处理和操作
  • ggplot2: 用于数据可视化

读取数据

1
2
3
4
5
sheet1 <- read_excel("data/tmp001.xls")
sheet2 <- read_excel("data/tmp002.xls")
sheet3 <- read_excel("data/tmp003.xls")
sheet4 <- read_excel("data/tmp004.xls")
sheet5 <- read_excel("data/tmp005.xls")

从5个Excel文件中读取数据,每个文件对应一个数据表。

1
loandata <- bind_rows(sheet1, sheet2, sheet3, sheet4, sheet5)

将5个数据表合并成一个大的数据框loandata

数据处理

1
df1 <- filter(loandata, FSTR_LOOPGROUPID == "NHNX40(1)")

loandata中筛选出FSTR_LOOPGROUPID为”NHNX40(1)”的数据,存储在df1中。

1
2
3
4
dftime <- data.frame()
duplicated(df1$FDT_TIME)
dftime <- df1[duplicated(df1$FDT_TIME), ]
print(dftime$FDT_TIME)

这段代码查找并打印df1中重复的时间戳。

1
2
3
4
5
Sys.setenv(TZ = 'UTC')
dftime1 <- df1[df1$FDT_TIME == dftime$FDT_TIME, ]
c2 <- apply(dftime1[, 4:6], 2, mean)
dfdata <- df1[!duplicated(df1$FDT_TIME), ]
dfdata[dfdata$FDT_TIME == dftime$FDT_TIME, 4:6] <- as.list(c2)

这部分代码:

  1. 设置系统时区为UTC
  2. 找出重复时间戳的数据
  3. 计算重复数据的4-6列的平均值
  4. 移除重复数据,并用平均值替换重复时间戳的数据

创建时间序列

1
2
3
4
5
6
7
start_time<-as.POSIXct("2010-04-18 00:00:00", tz = "UTC")
end_time <- as.POSIXct("2010-04-24 23:59:40", tz = "UTC")
time_seq <- seq(from = start_time, to = end_time, by = 20, tz = "UTC")

ts_df <- data.frame(num= time(time_seq), FDT_TIME=time_seq)
df4<-merge(dfdata, ts_df, by="FDT_TIME",all = TRUE)
dfqueshi<-subset(df4, is.na(df4$FINT_VOLUME))

这段代码创建了一个时间序列,并将其与数据合并,找出缺失值。

填充缺失值

1
2
3
4
5
6
7
8
for(i in dfqueshi$num)
{sum=df4[i-3,]$FINT_VOLUME+df4[i-2,]$FINT_VOLUME+df4[i-1,]$FINT_VOLUME
df4[i,]$FINT_VOLUME=sum/3
sum1=df4[i-3,]$FINT_SPEED+df4[i-2,]$FINT_SPEEDE+df4[i-1,]$FINT_SPEED
df4[i,]$FINT_SPEED=sum/3
sum2=df4[i-3,]$FINT_OCCUPY+df4[i-2,]$FINT_OCCUPY+df4[i-1,]$FINT_OCCUPY
df4[i,]$FINT_OCCUPY=sum/3
}

这个循环用前三个非缺失值的平均值填充缺失数据。

1
2
for(i in dfqueshi$num)
{df4$FSTR_LOOPGROUPID="NHNX40(1)"}

为所有缺失数据行添加FSTR_LOOPGROUPID值。

数据转换

1
2
3
4
5
6
df5<-df4
for(i in df5$num)
{df5[i,]$FINT_VOLUME=df5[i,]$FINT_VOLUME*180}
df20s<-df5
df5min<-df5
df15min<-df5

创建数据副本并调整FINT_VOLUME值。

5分钟数据聚合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
df5min <- df5min %>% arrange(df5min$FDT_TIME)
df5min <- df5min %>%
mutate(group = cut(df5min$FDT_TIME, "5 min")) %>%
group_by(group)
df5min_list <- split(df5min, df5min$group)
df_first_5min <- df5min_list[[1]]
new_df5min <- df5min %>%
group_by(group) %>%
summarise(sum_volume = sum(FINT_VOLUME),
sum_occupy = sum(FINT_OCCUPY),
mean_speed = mean(FINT_SPEED),
avr_volume = mean(FINT_VOLUME),
avr_occupy = mean(FINT_OCCUPY)) %>%
select(group, avr_volume, avr_occupy, mean_speed)

这段代码对数据进行5分钟的聚合,计算平均值等统计量。

15分钟数据聚合

1
2
3
4
5
6
7
8
9
10
11
12
13
14
df15min <- df15min %>% arrange(df15min$FDT_TIME)
df15min <- df15min %>%
mutate(group = cut(df15min$FDT_TIME, "15 min")) %>%
group_by(group)
df15min_list <- split(df15min, df15min$group)
df_first_15min <- df15min_list[[1]]
new_df15min <- df15min %>%
group_by(group) %>%
summarise(sum_volume = sum(FINT_VOLUME),
sum_occupy = sum(FINT_OCCUPY),
mean_speed = mean(FINT_SPEED),
avr_volume = mean(FINT_VOLUME),
avr_occupy = mean(FINT_OCCUPY)) %>%
select(group, avr_volume, avr_occupy, mean_speed)

这段代码对数据进行15分钟的聚合,计算平均值等统计量。

每日数据处理

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
dfday<-df5
dfday <- dfday %>% arrange(dfday$FDT_TIME)
dfday <- dfday %>%
mutate(group = cut(dfday$FDT_TIME, "24 hour")) %>%
group_by(group)
dfday_list <- split(dfday, dfday$group)
df_first_day <- dfday_list[[1]]

df_first_day_20s<-df_first_day
df_first_day_5min<-df_first_day
df_first_day_5min<- df_first_day_5min %>% arrange(df_first_day_5min$FDT_TIME)
df_first_day_5min <- df_first_day_5min %>%
mutate(group = cut(df_first_day_5min$FDT_TIME, "5 min")) %>%
group_by(group)
df_first_day_5min_list <- split(df_first_day_5min, df_first_day_5min$group)

df_first_day_15min<-df_first_day
df_first_day_15min<- df_first_day_15min %>% arrange(df_first_day_15min$FDT_TIME)
df_first_day_15min <- df_first_day_15min %>%
mutate(group = cut(df_first_day_15min$FDT_TIME, "15 min")) %>%
group_by(group)
df_first_day_15min_list <- split(df_first_day_15min, df_first_day_15min$group)

这段代码处理每日数据,包括20秒、5分钟和15分钟的聚合。

计算每日5分钟和15分钟平均值

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
new_df_first_day_5min <- df_first_day_5min %>%
group_by(group) %>%
summarise(
FINT_SPEED= mean(FINT_SPEED),
FINT_VOLUME= mean(FINT_VOLUME),
FINT_OCCUPY= mean(FINT_OCCUPY)) %>%
select(FINT_VOLUME, FINT_OCCUPY, FINT_SPEED)
start_time_5min<-as.POSIXct("2010-04-18 00:00:00", tz = "UTC")
end_time_5min <- as.POSIXct("2010-04-18 23:59:40", tz = "UTC")
time_seq_5min <- seq(from = start_time_5min, to = end_time_5min, by = 300, tz = "UTC")
new_df_first_day_5min$FDT_TIME=time_seq_5min

new_df_first_day_15min <- df_first_day_15min %>%
group_by(group) %>%
summarise(
FINT_SPEED = mean(FINT_SPEED),
FINT_VOLUME = mean(FINT_VOLUME),
FINT_OCCUPY = mean(FINT_OCCUPY)) %>%
select(FINT_VOLUME, FINT_OCCUPY, FINT_SPEED)
start_time_15min<-as.POSIXct("2010-04-18 00:00:00", tz = "UTC")
end_time_15min <- as.POSIXct("2010-04-18 23:59:40", tz = "UTC")
time_seq_15min <- seq(from = start_time_15min, to = end_time_5min, by = "15 min", tz = "UTC")
new_df_first_day_15min$FDT_TIME=time_seq_15min

这段代码计算每日数据的5分钟和15分钟平均值。

计算车辆长度

1
2
3
4
5
new_df_first_day_15min$midu<-new_df_first_day_15min$FINT_VOLUME/new_df_first_day_15min$FINT_SPEED
car_length<-new_df_first_day_15min$FINT_OCCUPY/new_df_first_day_15min$midu
car_length_new<-car_length[!is.na(car_length)]
avr_car_length<- mean(car_length_new)
car_length_time<-bind_cols(time_seq_15min,car_length)

这段代码计算了车辆密度和平均车长。

数据可视化

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
ggplot(df20s, aes(x = FINT_OCCUPY, y = FINT_SPEED)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_SPEED")

ggplot(df20s, aes(x = FINT_VOLUME, y = FINT_SPEED)) +
geom_point() +
labs(x = "FINT_VOLUME", y = "FINT_SPEED")

ggplot(df20s, aes(x = FINT_OCCUPY, y = FINT_VOLUME)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_VOLUME")

ggplot(new_df5min, aes(x = avr_occupy, y = mean_speed)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_SPEED")

ggplot(new_df5min, aes(x = avr_volume, y = mean_speed)) +
geom_point() +
labs(x = "FINT_VOLUME", y = "FINT_SPEED")

ggplot(new_df5min, aes(x = avr_occupy, y = avr_volume)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_VOLUME")

ggplot(new_df15min, aes(x = avr_occupy, y = mean_speed)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_SPEED")

ggplot(new_df15min, aes(x = avr_volume, y = mean_speed)) +
geom_point() +
labs(x = "FINT_VOLUME", y = "FINT_SPEED")

ggplot(new_df15min, aes(x = avr_occupy, y = avr_volume)) +
geom_point() +
labs(x = "FINT_OCCUPY", y = "FINT_VOLUME")

plot <- ggplot(df_first_day_20s, aes(x =FDT_TIME, y = FINT_SPEED)) +
xlab("时间(s)") + ylab("速度(m/s)")
plot + geom_line(data = df_first_day_20s, color = "red") +
geom_line(data = new_df_first_day_5min, aes(x = FDT_TIME, y = FINT_SPEED), color = "blue") +
geom_line(data = new_df_first_day_15min, aes(x = FDT_TIME, y = FINT_SPEED), color = "green")

plot2 <- ggplot(df_first_day_20s, aes(x =FDT_TIME, y = FINT_VOLUME)) +
xlab("时间(s)") + ylab("流量")
plot2 + geom_line(data = df_first_day_20s, color = "red") +
geom_line(data = new_df_first_day_5min, aes(x = FDT_TIME, y = FINT_VOLUME), color = "blue") +
geom_line(data = new_df_first_day_15min, aes(x = FDT_TIME, y = FINT_VOLUME), color = "green")

plot3 + geom_line(data = df_first_day_20s, color = "red") +
geom_line(data = new_df_first_day_5min, aes(x = FDT_TIME, y = FINT_OCCUPY), color = "blue") +
geom_line(data = new_df_first_day_15min, aes(x = FDT_TIME, y = FINT_OCCUPY), color = "green")

plot4 <- ggplot(new_df_first_day_15min, aes(x =FDT_TIME, y = FINT_VOLUME))
plot4 + geom_line(data = new_df_first_day_15min, color = "red") +
geom_line(data = new_df_first_day_15min, aes(x = FDT_TIME, y = FINT_SPEED*25), color = "blue")

ggplot(car_length_time, aes(x = time_seq_15min, y = car_length)) +
geom_point() +
labs(x = "时间", y = "车长")

以下是部分图像展示:

图1
图2
图3
图4
图5
图6
图7
图8
图9
图10
图11
图12
图13
图14