I recommend the R package `runner` for this kind of operation. `streak_run` counts consecutive occurrences of the same value, and `sum_run` calculates the sum over a window whose length is defined by the `k`
argument.
Here is the solution:
# Reproducible example: 20 small values scattered around zero.
set.seed(100)
x <- round(rnorm(20, sd = 0.02), 3)

# Position of each element inside its current run of equal `x > 0` values;
# the counter restarts at 1 every time the sign test flips.
n_of_sequence <- runner::streak_run(x > 0)

# Windowed sum: element i adds up the last n_of_sequence[i] values, i.e. the
# running total since the current streak began.
# Stored as `streak_sum` so the object does not mask base::sum(); the data
# frame column keeps the name `sum`.
streak_sum <- runner::sum_run(x, k = n_of_sequence)

data.frame(x, n_of_sequence, sum = streak_sum)
#         x n_of_sequence    sum
# 1  -0.010             1 -0.010
# 2   0.003             1  0.003
# 3  -0.002             1 -0.002
# 4   0.018             1  0.018
# 5   0.002             2  0.020
# 6   0.006             3  0.026
# 7  -0.012             1 -0.012
# 8   0.014             1  0.014
# 9  -0.017             1 -0.017
# 10 -0.007             2 -0.024
# 11  0.002             1  0.002
# 12  0.002             2  0.004
# 13 -0.004             1 -0.004
# 14  0.015             1  0.015
# 15  0.002             2  0.017
# 16 -0.001             1 -0.001
# 17 -0.008             2 -0.009
# 18  0.010             1  0.010
# 19 -0.018             1 -0.018
# 20  0.046             1  0.046
Below is a benchmark comparing the current solutions:
# Benchmark setup ----
# Packages are attached up front, in the same order as before (runner,
# data.table, tidyverse), so masking between them is unchanged.
library(runner)
library(data.table)
library(tidyverse)

set.seed(0)
x <- round(rnorm(10000, sd = 0.02), 3)
dt <- data.table(x)  # input for dt_streak()
df <- tibble(x = x)  # input for tv_streak()

# NOTE: the solutions do not all define a run the same way. runner_streak()
# and count_and_sum() split on `x > 0`, so an exact zero continues a negative
# run; dt_streak(), rle_streak() and tv_streak() split on sign(x), where a
# zero forms its own run. Results can therefore differ when x contains 0.

# Solutions ----

# runner: streak length per element, then a windowed sum of that length.
# Takes a numeric vector and returns the vector of running sums.
runner_streak <- function(x) {
  n_of_sequence <- streak_run(x > 0)
  sum_run(x, k = n_of_sequence)
}

# data.table: adds `n_of_sequence` and `sum` columns to `dt` by reference
# (`:=`), grouped by runs of equal sign(x). The caller's table is modified.
dt_streak <- function(dt) {
  dt[,
    c("n_of_sequence", "sum") := list(seq_len(.N), cumsum(x)),
    by = rleid(sign(x))
  ]
}

# rle + purrr: derive run lengths, then cumsum() each run's slice of x.
# Takes a numeric vector and returns the vector of running sums.
rle_streak <- function(x) {
  run_lengths <- rle(sign(x))$lengths

  # Position within each run. It is computed because it is part of the
  # solution being timed, even though only the sums are returned.
  n_of_sequence <- run_lengths %>%
    map(seq_len) %>%
    unlist()

  # Index of the first element of every run (drop the one-past-the-end value).
  start <- cumsum(c(1, run_lengths))
  start <- start[-length(start)]

  map2(start, run_lengths, ~ cumsum(x[.x:(.x + .y - 1)])) %>%
    unlist()
}

# dplyr: label runs with a change-point counter, then number and sum per run.
# `x` is a tibble with a numeric column named `x` (the benchmark passes `df`).
# The argument is used directly instead of reaching for the global `df`;
# `.data$x` makes explicit that the column, not the tibble, is meant.
tv_streak <- function(x) {
  x %>%
    mutate(seqno = cumsum(c(1, diff(sign(.data$x)) != 0))) %>%
    group_by(seqno) %>%
    mutate(n_of_sequence = row_number(), sum = cumsum(.data$x)) %>%
    ungroup() %>%
    select(-seqno)
}

# Base R: split x into runs, build one data frame per run, and bind them.
# Takes a numeric vector and returns a data frame with columns x, n and sum.
count_and_sum <- function(x) {
  runs <- rle((x > 0) * 1)$lengths
  groups <- split(x, rep(seq_along(runs), runs))
  output <- function(group) {
    data.frame(x = group, n = seq_along(group), sum = cumsum(group))
  }
  result <- as.data.frame(do.call(rbind, lapply(groups, output)))
  # rbind() derives row names from the group labels; replace them with 1..n.
  `rownames<-`(result, seq_len(nrow(result)))
}
# Time each solution over 100 evaluations.
microbenchmark::microbenchmark(
  runner_streak(x),
  dt_streak(dt),
  rle_streak(x),
  tv_streak(df),
  count_and_sum(x),
  times = 100L
)
# Unit: milliseconds
#              expr         min          lq        mean      median          uq        max neval
#  runner_streak(x)    4.240192    4.833563    6.321697    5.300817    6.543926   14.80221   100
#     dt_streak(dt)    7.648100    8.587887   10.862806    9.650483   11.295488   34.66027   100
#     rle_streak(x)   42.321506   55.397586   64.195692   63.404403   67.813738  167.71444   100
#     tv_streak(df)   31.398885   36.333751   45.141452   40.800077   45.756279  163.19535   100
#  count_and_sum(x) 1691.438977 1919.518282 2306.036783 2149.543281 2499.951020 6158.43384   100