Contents

Conceptual Rainfall-Runoff Model

Contents

import numpy as np
import pandas as pd
import time

import ipywidgets as widgets
from ipywidgets import interact, interactive

from bokeh.plotting import figure, show
from bokeh.io import output_notebook, push_notebook
from bokeh.models import LinearAxis, Range1d, ColumnDataSource
from bokeh.layouts import column, row

import numba

output_notebook()

Loading BokehJS ...

Conceptual Rainfall-Runoff Model#

In this notebook, we’re going to set up and calibrate a basic conceptual rainfall-runoff model and calibrate it to some precipitation and runoff data. The conceptual model is as follows:


]	$\(Q_1 = a_1 \cdot (S_1 - h_{1})^{m_1}\)\( \)\(Q_{12} = a_{12} \cdot S_1^{m_{12}}\)\( \)\(Q_2 = a_{2} \cdot S_2^{m_2}\)$

Import Climate Data#

We have observations of precipitation and runoff (and estimates of evaporation) at a frequency of 20 minutes. We would like to import them and set up empty vectors to represent the state variables for tank levels and the fluxes.

def initialize_df(df):
    # initialize the columns from the input data
    cols = ['S1 (mm)', 'Q1 (mm/h)', 'Q12 (mm/h)', 'S2 (mm)', 'Q2 (mm/h)', 'Q1+Q2 (mm/h)']
    df.loc[:, cols] = np.nan
    # calculate the timestep in minutes
    # df['Date'] = pd.to_datetime(df['Date'].to_numpy())
    # get the timestep and convert it to hours
    df['dt_hours'] = df['Date'].diff().dt.total_seconds().div(3600)
    # for some reason the Date column is unsorted, so sort it
    df.sort_values('Date', inplace=True)
    return df

raw_data = pd.read_csv('data/data.csv', parse_dates=['Date'], infer_datetime_format=True, dayfirst=True)
df = initialize_df(raw_data)
df.set_index('Date', inplace=True)
df.head()

	Rain (mm/h)	Evap (mm/h)	Runoff (mm/h)	S1 (mm)	Q1 (mm/h)	Q12 (mm/h)	S2 (mm)	Q2 (mm/h)	Q1+Q2 (mm/h)	dt_hours
Date
1987-09-02 00:20:00	0.0	0.0	0.038	NaN	NaN	NaN	NaN	NaN	NaN	NaN
1987-09-02 00:40:00	0.0	0.0	0.038	NaN	NaN	NaN	NaN	NaN	NaN	0.333333
1987-09-02 01:00:00	0.0	0.0	0.038	NaN	NaN	NaN	NaN	NaN	NaN	0.333333
1987-09-02 01:20:00	0.0	0.0	0.038	NaN	NaN	NaN	NaN	NaN	NaN	0.333333
1987-09-02 01:40:00	0.0	0.0	0.038	NaN	NaN	NaN	NaN	NaN	NaN	0.333333

# this will check if any value in any row is null (i.e. NaN)
# df[df[['Rain (mm/h)', 'Evap (mm/h)', 'Runoff (mm/h)']].isnull().any(axis=1)]
# plot_cols = [c for c in df.columns if c not in ['dt', 'Date']]

p_label = 'Rain (mm/h)'
q_label = 'Runoff (mm/h)'
e_label = 'Evap (mm/h)'

p = figure(title='Observed Precipitation and Runoff', 
           x_axis_type='datetime', width=800, height=400)
p.line(df.index, df[q_label], line_width=2,
       legend_label=q_label, color='dodgerblue')
p.line(df.index, df[e_label], line_width=2,
       legend_label=e_label, color='orange')
p.xaxis.axis_label = 'Date'
p.yaxis.axis_label = q_label

p.extra_y_ranges = {"precip": Range1d(df[p_label].max(), 0)}
# data is in 20 minute intervals, and time is in milliseconds,
# so set width to 3.6E6 / 3
precip = p.vbar(
    x=df.index, 
    top=0,
    bottom=df[p_label],
    width=20 * 60E3,
    y_range_name='precip', 
    legend_label=p_label, 
    alpha=0.5,
    color='lightseagreen',
    
)
precip.level = 'underlay'
p.add_layout(LinearAxis(axis_label=p_label, y_range_name='precip'), "right")

show(p)

def update_df():
    # if you are working in Jupyter lab, you can import the data just once.  
    # However, there might be issues later on if you are updating the dataframe in 
    # such a way that values from another iteration interfere.  
    # it may be a case of just running a modified version of the 'initialize_df'
    # function where you just reset the desired columns to nan.  This won't take as much
    # time as reading the csv and converting the Date column to pandas datetime type.    
    # df = initialize_df(pd.read_csv('data/data.csv', parse_dates=['Date'], dayfirst=True))       
    # set_initial rain and evap as zero
    rain_0, evap_0 = 0, 0
    
    for i in range(0, len(df)):
        
        if i == 0:
            this_S1 = S1_init.value
            this_S2 = S2_init.value
        else:
            # get the timestep and convert it to hours
            dt = df.at[i, 'dt_hours']
            
            rain = df.at[i - 1, 'Rain (mm/h)']
            evap = df.at[i - 1, 'Evap (mm/h)']
            
            q1 = df.at[i - 1, 'Q1 (mm/h)']
            q12 = df.at[i - 1, 'Q12 (mm/h)']
            q2 = df.at[i - 1, 'Q2 (mm/h)']
            S1 = df.at[i - 1, 'S1 (mm)']
            S2 = df.at[i - 1, 'S2 (mm)']
            
            # since our dt is in hours and fluxes in mm/h
            # multiply by the timestep to get the sub-hourly fluxes
            delta_q1 = (rain - evap - q1 - q12) * dt 
            # if i < 20:
            #     print(f'@dt={dt} delta q1 = {delta_q1:.3f}, rain={rain:.1f}, evap={evap:.1f}, q1={q1:.2f}, q12={q12:.2f}')
            this_s1 = max(0, S1 + delta_q1)

            delta_q2 = (q12 - q2) * dt
            this_s2 = max(0, S2 + delta_q2)
            
        # set reservoir levels for the current timestep
        df.at[i, 'S1 (mm)'] = this_S1
        df.at[i, 'S2 (mm)'] = this_S2
        
        # set flows for the current timestep based on current levels
        df.at[i, 'Q1 (mm/h)'] = a1.value * (this_S1 - h1.value)**m1.value
        df.at[i, 'Q12 (mm/h)'] = a12.value * (this_S1)**m12.value
        df.at[i, 'Q2 (mm/h)'] = a2.value * (this_S2)**m2.value
        df.at[i, 'Q1+Q2 (mm/h)'] = df.at[i, 'Q1 (mm/h)'] + df.at[i, 'Q2 (mm/h)']
    return df

Interactive Inputs#

@numba.jit(nopython=True)
def update_states_vectorized(rain, evap, dt, a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init):
#     df = initialize_df(pd.read_csv('data/data.csv', parse_dates=True, infer_datetime_format=True))
    # for jit compiling, we need to initialize empty vectors  
    # for each of the input columns from the pandas method
    s1, s2 = np.empty(len(rain)), np.empty(len(rain))
    q1, q2 = np.empty(len(rain)), np.empty(len(rain))
    q12 = np.empty(len(rain))
    q_tot = np.empty(len(rain))
    
    # set initial reservoir and outflow values
    s1[0] = S1_init
    s2[0] = S2_init
    q1[0] = 0 if s1[0] <= h1 else (a1 * (s1[0] - h1)**m1 ) * dt[1]
    q12[0] = (a12 * (s1[0])**m12) * dt[1]  
    q2[0] = (a2 * s2[0]**m2) * dt[1]
    q_tot[0] = q1[0] + q2[0]
    
    for i in range(1, len(rain)):

        q1_balance = (rain[i-1] - evap[i-1] - q1[i-1] - q12[i-1]) * dt[i]
        this_S1 = max(0, s1[i - 1] + q1_balance)

        delta_q2 =  (q12[i - 1] - q2[i - 1]) / dt[i]
        this_S2 = max(0, s2[i - 1] + delta_q2)

        s1[i] = this_S1
        s2[i] = this_S2

        q1[i] = 0 if this_S1 <= h1 else (a1 * (this_S1 - h1)**m1) * dt[i]
        q12[i] = 0 if this_S1 <= 0 else (a12 * (this_S1)**m12) * dt[i]
        q2[i] = 0 if this_S2 <= 0 else (a2 * this_S2**m2) * dt[i]

        q_tot[i] = q1[i] + q2[i]

    return s1, s2, q1, q2, q12, q_tot

def vectorized_df_update(a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init):
    p_label = 'Rain (mm/h)'
    e_label = 'Evap (mm/h)'
    rain = df[p_label].to_numpy()
    evap = df[e_label].to_numpy()
    dt = df['dt_hours'].to_numpy()
    s1, s2, q1, q2, q12, q_tot = update_states_vectorized(rain, evap, dt, a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init)
    df['S1 (mm)'] = s1
    df['S2 (mm)'] = s2
    df['Q1 (mm/h)'] = q1
    df['Q2 (mm/h)'] = q2
    df['Q12 (mm/h)'] = q12
    df['Q1+Q2 (mm/h)'] = q_tot 
    return df
    

def plot_model(a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init):
        
    # df = update_df()
    df = vectorized_df_update(a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init)
    source = ColumnDataSource(df)
    
    q_label = 'Runoff (mm/h)'
    sim_label = 'Q1+Q2 (mm/h)'
    
    fig = figure(width=800, height=300, x_axis_type='datetime',
                title='Observed vs. Simulated Runoff')
    
    # Plot Measured vs. Simuilated Runoff
    fig.line('Date', q_label, color='dodgerblue', line_width=2, 
             legend_label='Observed', source=source)
    fig.line('Date', sim_label, color='navy', line_width=2, 
             line_dash='dashed', legend_label='Simulated',
             source=source)
    
    fig.yaxis.axis_label = 'Runoff (mm/h)'
    
    # Tank 1 and Tank 2 flow plot    
    f1 = figure(title='Tank 1 and 2 Flow', width=800, height=150,
               x_axis_type='datetime')
    f1.line('Date', 'Q1 (mm/h)', legend_label='Tank 1', 
            line_width=2, color='purple', source=source)
    f1.line('Date', 'Q2 (mm/h)', legend_label='Tank 2', 
            line_width=2, color='navy', source=source)
    f1.yaxis.axis_label = 'Runoff (mm/h)'
    
    # Regression Plot
    f2 = figure(width=400, height=450, title="Measured vs. Simulated Regression")
    f2.circle('Runoff (mm/h)', 'Q1+Q2 (mm/h)', 
            source=source)
    
    
    # drop NaN values for Least-Squares fit
    df.dropna(inplace=True)
    
    # Best-fit line for regression plot
    fit = np.polyfit(df['Runoff (mm/h)'], df['Q1+Q2 (mm/h)'], 1)
    
    x = np.linspace(df['Runoff (mm/h)'].min(), df['Runoff (mm/h)'].max(), 100)
    y = [fit[0] * e + fit[1] for e in x]
    f2.line(x, y, color='red', line_dash='dashed')
    
    # calculate the R^2 and the NSE
    df['square_diff_Q1_Q2'] = (df['Runoff (mm/h)'] - df['Q1+Q2 (mm/h)'])**2
    
    # set a lower limit value to avoid divide by zero errors    
    df = df[(df['Q1+Q2 (mm/h)'] != 0) & (df['Runoff (mm/h)'] != 0)].copy()
    df['log_square_diff_Q1_Q2'] = (np.log(df['Runoff (mm/h)']) - np.log(df['Q1+Q2 (mm/h)']))**2
    df['log_square_diff_Q1_Q2'] = df[np.isfinite(df['log_square_diff_Q1_Q2'])]['log_square_diff_Q1_Q2']
    nse = 1 - df['square_diff_Q1_Q2'].mean() / np.var(df['Runoff (mm/h)'])
#     log_nse = df['log_square_diff_Q1_Q2'].mean() / np.var(df['Runoff (mm/h)'])
    performance_str = f'Sim. vs. Obs Regression (NSE={nse:.2f})'
    f2.title = performance_str
    # print(performance_str)
    layout = row(column(fig, f1), f2)
    show(layout)
    # plt.annotate(performance_str, xy=(0, 0),
    #                xytext=(5, 340),
    #                textcoords="offset points",
    #                ha='left', va='center',
    #                color='black', weight='bold', 
    #                clip_on=True)
    

from ipywidgets import TwoByTwoLayout, interactive_output, HBox
from IPython.display import display

a1 = widgets.FloatSlider(min=0., max=0.5, step=0.01, value=0.02, orientation='vertical', description='a1', continuous_update=False)
m1 = widgets.FloatSlider(min=0., max=2., step=0.01, value=1.0, orientation='vertical', description='m1', continuous_update=False)
h1 = widgets.FloatSlider(min=0., max=2., step=0.01, value=0.1, orientation='vertical', description='h1', continuous_update=False)
a12 = widgets.FloatSlider(min=0., max=0.5, step=0.01, value=0.021, orientation='vertical', description='a12', continuous_update=False)
m12 = widgets.FloatSlider(min=0., max=2., step=0.01, value=1., orientation='vertical', description='m12', continuous_update=False)
a2 = widgets.FloatSlider(min=0., max=1., step=0.01, value=0.2, orientation='vertical', description='a2', continuous_update=False)
m2 = widgets.FloatSlider(min=0., max=2., step=0.01, value=1.1, orientation='vertical', description='m2', continuous_update=False)
S1_init = widgets.FloatSlider(min=0., max=2., step=0.01, value=1.0, orientation='vertical', description='t1_init', continuous_update=False)
S2_init = widgets.FloatSlider(min=0., max=2., step=0.01, value=1.0, orientation='vertical', description='t2_init', continuous_update=False)

df = initialize_df(pd.read_csv('data/data.csv', parse_dates=['Date'], dayfirst=True))

sliders = {
    'a1': a1, 'm1': m1, 'h1': h1,
    'a12': a12, 'm12': m12, 
    'a2': a2, 'm2': m2, 'S1_init': S1_init, 'S2_init': S2_init,
}

out = interactive_output(
    plot_model, 
    sliders
)

display(HBox([a1, m1, h1, a12, m12, a2, m2, S1_init, S2_init]), out)

Examine Computational Performance#

Test the cycle time of the original function and compare to an updated (vectorized) function.

param_list = [('a1', 0.33), 
              ('m1', 1.0), 
              ('h1', 1.8), 
              ('a12', 0.02), 
              ('m12', 0.6), 
              ('a2', 0.02), 
              ('m2', 0.8), 
              ('S1_init', 1.0), 
              ('S2_init', 1.0)
             ]

params = {p[0]: max(0, np.random.normal(p[1])) for p in param_list}

# run some number of loops to generate an average execution time
times1, times2 = [], []
n_iterations = 10
for i in range(n_iterations):
    t0 = time.time()
    df = update_df()
    t1 = time.time()
    
    rain = df['Rain (mm/h)'].to_numpy()
    evap = df['Evap (mm/h)'].to_numpy()
    dt = df['dt_hours'].to_numpy()
    s1, s2, q1, q2, q12, q_tot = update_states_vectorized(rain=rain, evap=evap, dt=dt, **params)
    t2 = time.time()
    times1.append(t1 - t0)
    times2.append(t2 - t1)

mean1 = np.mean(times1)
mean2 = np.mean(times2)
print(f'Avg. of {n_iterations} iterations:')
print(f'    update_df: {mean1:.2f}/loop')
print(f'    update_df_update: {mean2:2f}/loop')
print(f'Vectorization speeds up computation ~{round(mean1/mean2, 0)}x')

Avg. of 10 iterations:
    update_df: 1.84/loop
    update_df_update: 0.036734/loop
Vectorization speeds up computation ~50.0x