Time Series Analysis - Energy consumption in Barcelona¶
Nowadays, studying Data Science and developing personal projects is easier than ever: there are many websites with free and interesting datasets out there. This is just one example. At the Barcelona government website, I found this dataset, which gives us a very good starting point for a Time Series analysis.
Data cleaning and preparation¶
The pandas
library has many functionalities for working with time series. But to really make
the most of these capabilities, it's essential to have a datetime index. That's what we're doing in the
code cell below. I won't go into much detail here: we're just renaming columns and preparing the dates to
become a useful DateTimeIndex.
import pandas as pd
import numpy as np
import plotly.io as pio
pio.renderers.default = "plotly_mimetype+notebook"
df = pd.read_csv('data_raw.csv', thousands='.')
# Drop empty column
df.drop(columns=['Unnamed: 1'], inplace=True)
# Format year and months
# Rows like 'Año 2002' hold the year; keep only the number
df.Periodo = df.Periodo.apply(lambda x: x.split()[-1] if 'Año' in x else x)
df = df.iloc[4:].reset_index(drop=True)
months = ['Enero', 'Febrero', 'Marzo', 'Abril', 'Mayo', 'Junio', 'Julio', 'Agosto', 'Septiembre', 'Octubre', 'Noviembre', 'Diciembre']
d = dict(zip(months, np.arange(1,13)))
df['Periodo'] = df.Periodo.str.strip().replace(d)
def format_date(row):
    # Year rows keep their value as a string; month rows now hold an integer 1-12
    if isinstance(row['Periodo'], str):
        return row['Periodo']
    # The row holding the year sits exactly `month` positions above the current row
    year_index = row.name - row['Periodo']
    year = df.loc[year_index, 'Periodo']
    return '1-' + str(row['Periodo']) + '-' + year

df['Periodo'] = df.apply(format_date, axis=1)
df = df[df['Periodo'].str.contains('-')]
# Change column names
renamed = dict(zip(df.columns, ['period', 'total', 'domestic', 'low_voltage', 'high_voltage']))
df.rename(columns=renamed, inplace=True)
# Set datetime index
df['period'] = pd.to_datetime(df['period'], dayfirst=True)
df.set_index('period', inplace=True)
# Keep only the years where we have months
df = df['2002-01-01':]
df.head()
| period | total | domestic | low_voltage | high_voltage |
|---|---|---|---|---|
| 2002-01-01 | 586030 | 276261 | 167946 | 141823 |
| 2002-02-01 | 551446 | 219226 | 150389 | 181831 |
| 2002-03-01 | 608499 | 269371 | 164810 | 174318 |
| 2002-04-01 | 440794 | 185130 | 137355 | 118309 |
| 2002-05-01 | 517328 | 262391 | 157361 | 97576 |
Exploratory Data Analysis¶
Let's start by plotting all the data to see how it looks.
import plotly.express as px
COLOR_MAP = dict(zip(df.columns, ['#DDBEA8', '#488B49','#F3DFC1', '#C25B66',]))
fig = px.line(df, width=800,
color_discrete_map=COLOR_MAP) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
fig.show()
The peak in February 2004 is explained in the original source of the data:
Low voltage gives an error due to problems with the S.C.E. In March the excess gets compensated.
To balance it, let's set both entries to the average of the two months. The good thing about using dates as an index is that we can select entries based on the date. Here we're selecting the values for February and March.
feb = df.loc['2004-02-01']
mar = df.loc['2004-03-01']
df.loc['2004-02-01':'2004-03-01','low_voltage'] = (feb['low_voltage'] + mar['low_voltage']) / 2
df.loc['2004-02-01':'2004-03-01','total'] = (feb['total'] + mar['total']) / 2
df.loc['2004-02-01':'2004-03-01']
| period | total | domestic | low_voltage | high_voltage |
|---|---|---|---|---|
| 2004-02-01 | 594331.5 | 271946 | 161593 | 162441 |
| 2004-03-01 | 594331.5 | 261995 | 161593 | 169095 |
Long-term trends - rolling average¶
Another advantage is the ability to resample the dataset very quickly. Here, for example, we can easily get the total consumption over each year. This reveals that there's been a reduction in domestic energy consumption in recent years, along with a slight increase in non-domestic consumption.
data = df.loc[:,['domestic', 'low_voltage', 'high_voltage']].resample('A').sum()
fig = px.line(data,
              title="Annual energy consumption in Barcelona",
width=800,
color_discrete_map=COLOR_MAP) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
fig.show()
Adding up all the values within a year gives a good estimate of the trend, but a better one comes from rolling averages. Rolling statistics are often used to smooth out short-term fluctuations and highlight long-term trends or patterns in the data.
import plotly.graph_objects as go
data = df.loc[:,['domestic', 'low_voltage', 'high_voltage']].rolling(12).mean()
fig = px.line(data,
title="Rolling average - 12 months window",
width=800,
color_discrete_map=COLOR_MAP) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
fig.add_trace(go.Scatter(
x=['2008', '2020'],
y=[250000, 250000],
text=["2008-09 Debt Crisis",
"COVID-19"],
mode="text",
name=""
))
fig.add_shape(type="rect",
              xref='paper', yref='paper',
              x0=0.35, y0=0,
              x1=0.4, y1=1,
              line=dict(color='rgba(250,250,250,0)'),
              fillcolor='rgba(250,250,250,0.1)')
fig.add_shape(type="rect",
              xref='paper', yref='paper',
              x0=0.95, y0=0,
              x1=0.98, y1=1,
              line=dict(color='rgba(250,250,250,0)'),
              fillcolor='rgba(250,250,250,0.1)')
fig.show()
By taking the rolling average over the past 12 months, we can see the trends much more clearly. The effects of big macroeconomic events such as the 2008 debt crisis and the COVID-19 pandemic are also very obvious.
Intra-year variability - detrending¶
Now we want to see whether there are months where the energy consumption is higher than in others. We could
plot the mean of each month, but here we are plotting all the years together to see how the whole trend
flows. By subtracting the rolling mean from the original time series, we remove the long-term patterns;
hence, we are removing inter-year variability. Then we have to transform the years into columns and keep
each month as a separate row. Pandas' pivot_table
method is particularly useful for that.
In general, we see that in the summer months there's a decrease in domestic energy consumption and an increase in the rest of low voltage (corresponding to small businesses) and in high voltage.
In green we've marked the year 2009, when the energy supply in Barcelona moved to another company. In red we've highlighted the year 2020, where we can clearly see the effect of the lockdown.
from plotly.subplots import make_subplots

def plot_year_trend(df, column):
    sub_df = df.loc['2003':'2021', [column]]
    # Remove long-term patterns by subtracting the 12-month rolling mean
    rolling = sub_df.rolling(12).mean()
    df_detrend = sub_df - rolling
    df_detrend['Month'] = df_detrend.index.month
    df_detrend['Year'] = df_detrend.index.year
    # One column per year, one row per month
    df_detrend = pd.pivot_table(df_detrend, index='Month', columns='Year', values=column)

    def get_color(year):
        if year == 2020:
            return COLOR_MAP['high_voltage']
        if year == 2009:
            return COLOR_MAP['domestic']
        return COLOR_MAP['low_voltage']

    color_map = {str(y): get_color(y) for y in df_detrend.columns}
    fig = px.line(df_detrend, title=f"Annual trend for {column}",
                  color_discrete_map=color_map)
    return fig
trend_figures = [plot_year_trend(df, 'domestic'),
                 plot_year_trend(df, 'low_voltage'),
                 plot_year_trend(df, 'high_voltage'),]
fig = make_subplots(rows=len(trend_figures),
                    shared_xaxes=True,
                    vertical_spacing=0.01,
                    row_titles=['Domestic', 'Low voltage', 'High voltage'],
                    cols=1)
for i, figure in enumerate(trend_figures):
    for trace in range(len(figure["data"])):
        fig.append_trace(figure["data"][trace], row=i+1, col=1)
fig.update_layout(height=750, width=800,
showlegend=False,
title=f"""Intra-year variability in energy consumption
<br><sup>
<span style='color: {COLOR_MAP["domestic"]}'>2009</span>
<span style='color: {COLOR_MAP["high_voltage"]}'>2020</span>
<span style='color: {COLOR_MAP["low_voltage"]}'>rest of the years</span>
</sup>""",
template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
)\
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False) \
.update_traces(line={'width':1.5})
fig.show()
The valley seen in 2009 is explained by this:
(5) As of July 2009, a new company, Endesa Energía XXI S.L., was created, which inherited from the old company Endesa Distribución S.L. all clients up to a power of 10 kW; the rest have gone to the free market.
(6) As of September 2009, the energy billed by the new company Endesa Energía XXI S.L. is included.
Periodicity - First-order differencing¶
Time series normally have two basic components: trend and seasonality. Until now, we've looked into the trends, both long- and short-term. To look at seasonality, we can apply differencing. This computes the difference between consecutive observations in the time series and helps in focusing on the changes within each season. First-order differencing has many applications in different fields, like Heart Rate Variability analysis.
In our case, it helps to remove the long-term trends and to see the periodicity of the data, as the plot shows. We can clearly see "waves", or a frequency pattern, in the energy consumption.
diff = df[['domestic', 'low_voltage', 'high_voltage']].diff()
px.line(diff[1:],
width=800, height=600,
color_discrete_map=COLOR_MAP,) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False) \
.update_traces(line={'width': 1})
Correlation¶
When we first explored the data, we saw that, in the long term, domestic consumption was decreasing while high voltage consumption was increasing. We should then expect some correlation between the two variables. But when we compute the correlations, we don't see very high values (left heatmap). This is because a correlation calculated on the unprocessed data doesn't distinguish between trend and seasonal components explicitly. In other words, while there may be a negative correlation between two variables in the long term, the positive seasonal correlation (right heatmap) would cancel the first one out.
But these values still aren't quite right. A quick look at the first-order differencing plot (above) reveals that the data has a lot of noise. To really see the correlation between the components, we have to decompose the data.
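To build some intuition for this cancellation, here's a tiny synthetic sketch of mine (not derived from the dataset): two series share the same seasonality but have opposite trends, so the raw correlation nearly vanishes while the differenced, seasonal correlation is strongly positive.
# Synthetic illustration: opposite trends + shared 12-month seasonality
t = np.arange(240)
season = np.sin(2 * np.pi * t / 12)
a = -0.01 * t + season + np.random.normal(0, 0.1, len(t))
b = 0.01 * t + season + np.random.normal(0, 0.1, len(t))
print(np.corrcoef(a, b)[0, 1])                    # near zero: trend and seasonality cancel out
print(np.corrcoef(np.diff(a), np.diff(b))[0, 1])  # strongly positive: seasonality dominates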
general_correlation = df.iloc[:,1:].corr()
seasonal_corr = df.diff().iloc[1:,1:].corr()
fig = make_subplots(rows=1, cols=2,
shared_yaxes=True,
column_titles=["General correlation", "Seasonal correlation"])
trend_traces = px.imshow(general_correlation,
text_auto=".2f",
color_continuous_scale=[COLOR_MAP['high_voltage'],
COLOR_MAP['low_voltage'],
COLOR_MAP['domestic']],
x=['Domes.', 'Low V', 'High V'],
y=['Domes.', 'Low V', 'High V']) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)['data']
seasonal_traces = px.imshow(seasonal_corr,
text_auto=".2f",
x=['Domes.', 'Low V', 'High V'],
y=['Domes.', 'Low V', 'High V'],
color_continuous_scale=[COLOR_MAP['high_voltage'],
COLOR_MAP['low_voltage'],
COLOR_MAP['domestic']],) \
['data']
for trace in trend_traces:
fig.add_trace(trace, row=1, col=1)
for trace in seasonal_traces:
fig.add_trace(trace, row=1, col=2)
fig.update_layout(template='plotly_dark',
width=800,
plot_bgcolor='#34495E',
paper_bgcolor='#34495E')
fig.show()
Decomposition¶
Apart from trend and seasonality, there's also a lot of noise and other short-term fluctuations. While trends correspond to the long-term direction of the data and seasonality captures repeating patterns, residuals represent the remaining variability. Any decent forecasting task has to take these two components (and the residuals) into consideration. By understanding the trends and seasonality inherent in a time series, we can build more accurate and reliable forecasting models.
The Python library statsmodels
has a very useful function to do this. Now, the seasonal plot
makes much more sense than before.
from statsmodels.tsa.seasonal import seasonal_decompose
decompositions = {column: seasonal_decompose(df.loc[:,column]) for column in df.columns[1:]}
fig = make_subplots(rows=3, cols=1,
row_titles=['Trend component', 'Seasonal component', 'Residual'])
for column in decompositions.keys():
    decompositions[column].trend.name = column
    decompositions[column].seasonal.name = column
    # Drop the NaNs first so the x and y arrays stay aligned (trend and resid have NaNs at the edges)
    trend = decompositions[column].trend.dropna()
    seasonal = decompositions[column].seasonal.dropna()
    resid = decompositions[column].resid.dropna()
    fig.add_trace(go.Scatter(x=trend.index, y=trend,
                             line=dict(color=COLOR_MAP[column])), row=1, col=1)
    fig.add_trace(go.Scatter(x=seasonal.index, y=seasonal,
                             line=dict(color=COLOR_MAP[column])), row=2, col=1)
    fig.add_trace(go.Scatter(x=resid.index, y=resid,
                             line=dict(color=COLOR_MAP[column])), row=3, col=1)
fig.update_layout(height=750, width=800,
showlegend=False,
title=f"""Energy consumption decomposition
<br><sup>
<span style='color: {COLOR_MAP["domestic"]}'>domestic</span>
<span style='color: {COLOR_MAP["low_voltage"]}'>low voltage</span>
<span style='color: {COLOR_MAP["high_voltage"]}'>high voltage</span>
</sup>""",
template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
)\
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False) \
.update_traces(line={'width':1.5})
fig.show()
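As a quick sanity check (a small addition of mine), seasonal_decompose uses an additive model by default, so summing the three components should reconstruct the original series wherever all of them are defined:
# Additive decomposition: observed = trend + seasonal + residual
dec = decompositions['domestic']
reconstructed = (dec.trend + dec.seasonal + dec.resid).dropna()
print(np.allclose(reconstructed, df.loc[reconstructed.index, 'domestic']))  # expected: True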
We can recalculate the correlations, this time with more accurate values. Now we see something that makes sense. There's a negative correlation between domestic energy consumption and the rest (corresponding to small businesses and industry). In the trend component this measure is quite low, but it's much higher in the seasonal component, as we expected when we plotted the intra-year variability.
trend_df = pd.concat([dec.trend for dec in decompositions.values()], axis=1).dropna()
seasonal_df = pd.concat([dec.seasonal for dec in decompositions.values()], axis=1).dropna()
trend_corr = trend_df.corr()
seasonal_corr = seasonal_df.corr()
fig = make_subplots(rows=1, cols=2,
shared_yaxes=True,
column_titles=["Trend correlation", "Seasonal correlation"])
trend_traces = px.imshow(trend_corr,
text_auto=".2f",
color_continuous_scale=[COLOR_MAP['high_voltage'],
COLOR_MAP['low_voltage'],
COLOR_MAP['domestic']],
x=['Domes.', 'Low V', 'High V'],
y=['Domes.', 'Low V', 'High V']) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)['data']
seasonal_traces = px.imshow(seasonal_corr,
text_auto=".2f",
x=['Domes.', 'Low V', 'High V'],
y=['Domes.', 'Low V', 'High V'],
color_continuous_scale=[COLOR_MAP['high_voltage'],
COLOR_MAP['low_voltage'],
COLOR_MAP['domestic']],) \
['data']
for trace in trend_traces:
fig.add_trace(trace, row=1, col=1)
for trace in seasonal_traces:
fig.add_trace(trace, row=1, col=2)
fig.update_layout(template='plotly_dark',
width=800,
plot_bgcolor='#34495E',
paper_bgcolor='#34495E')
fig.show()
Autocorrelation¶
Another way to detect seasonality is the autocorrelation function (ACF). It quantifies the degree to which each observation is related to previous observations at different lags. We can set lags of different periods, like 6 months, 12, etc. It's crucial for identifying the order of autoregressive (AR) and moving average (MA) processes, and helps a lot when selecting the appropriate parameters for ARIMA models (we'll see them soon).
When we plot the ACF, we clearly see peaks every 12 months. This means that the series are correlated with themselves shifted by one year.
from statsmodels.tsa.stattools import acf, pacf
acf_df = pd.DataFrame({column:acf(df.loc[:,column], nlags=36)
for column in df.columns[1:]},)
px.line(acf_df, width=800, height=600,
        title="""Autocorrelation function
        <br><sup>We observe a clear correlation peak at lag 12</sup>""",
        color_discrete_map=COLOR_MAP,) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
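As a quick numeric cross-check of that lag-12 peak (my addition, using pandas' built-in Series.autocorr, which computes a plain Pearson correlation with the shifted series, so the values are close to, though not identical to, the ACF ones):
# Lag-12 autocorrelation of each series
for column in df.columns[1:]:
    print(f"{column}: {df[column].autocorr(lag=12):.3f}")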
The autocorrelation function has a sister: the partial autocorrelation function (PACF). This function also shows the relationships between observations at different lags, but it excludes the effect of the intermediate observations. For example, if we're measuring the correlation at lag 5, the function represents the relation between the current observation and the one that's exactly 5 time points behind, excluding the ones that are 4, 3, 2 or 1 time points behind.
The PACF is particularly useful to tune the order $p$ in autoregressive models ($AR(p)$). This order $p$ indicates how many past values are considered in the model. ARIMA models assume stationarity, so differencing is applied before computing the PACF to achieve stationarity, remove trends, and focus on the direct autocorrelations needed to identify the order of the autoregressive terms in the model.
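For instance, once a candidate $p$ has been read off the PACF, fitting a pure autoregressive model on the differenced series is a one-liner with statsmodels. This is just a sketch; the lag value below is illustrative, not a tuned choice:
from statsmodels.tsa.ar_model import AutoReg

# AR(p) on the differenced (stationary) series; p=12 is only an example value
ar_model = AutoReg(df['domestic'].diff().dropna(), lags=12).fit()
print(ar_model.params.head())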
In the following plots, any spikes outside the greyed areas are statistically significant, meaning that there is a strong autocorrelation between values at those lags.
def create_corr_plot(column, plot_pacf=False):
    series = df.loc[:, column]
    # Work on the differenced series to remove trends; alpha=0.05 returns 95% confidence intervals
    corr_array = pacf(series.diff().dropna(), alpha=0.05) if plot_pacf else acf(series.diff().dropna(), alpha=0.05)
    lower_y = corr_array[1][:, 0] - corr_array[0]
    upper_y = corr_array[1][:, 1] - corr_array[0]

    fig = go.Figure()
    # Vertical stems from zero up to each correlation value
    for x in range(len(corr_array[0])):
        fig.add_scatter(x=(x, x), y=(0, corr_array[0][x]), mode='lines', line_color=COLOR_MAP['total'])
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=corr_array[0], mode='markers',
                    marker_color=COLOR_MAP[column], marker_size=8)
    # Shaded significance band around zero
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=upper_y, mode='lines', line_color='rgba(255,255,255,0)')
    fig.add_scatter(x=np.arange(len(corr_array[0])), y=lower_y, mode='lines', fillcolor='rgba(250,250,250,0.1)',
                    fill='tonexty', line_color='rgba(255,255,255,0)')
    fig.update_traces(showlegend=False)
    fig.update_yaxes(zerolinecolor='#000000')
    return fig
fig = make_subplots(rows=3, cols=2,
column_titles=['Autocorrelation', 'Partial autocorrelation'],
row_titles=['Domestic', 'Low voltage', 'High voltage'])
pacf_figures = [create_corr_plot(column, True) for column in df.columns[1:]]
acf_figures = [create_corr_plot(column) for column in df.columns[1:]]
for i, figure in enumerate(pacf_figures):
for trace in range(len(figure["data"])):
fig.append_trace(figure["data"][trace], row=i+1, col=2)
for i, figure in enumerate(acf_figures):
for trace in range(len(figure["data"])):
fig.append_trace(figure["data"][trace], row=i+1, col=1)
fig.update_layout(
width=800,height=900,
template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
fig.show()
Time series forecasting¶
ARIMA models¶
ARIMA stands for AutoRegressive Integrated Moving Average. It's a combination of three different parts. The Autoregressive process models the current value of a variable as a linear combination of its past values. How many past values? That's determined by the parameter $p$.
$${\displaystyle X_{t}=c+\sum _{i=1}^{p}\varphi _{i}X_{t-i}+\varepsilon _{t}}$$The Moving Average part, on the other hand, models the current value as a linear combination of past forecast errors (not an average of past values, despite the name). How many past errors is determined by the parameter $q$:
$${\displaystyle X_{t}=\mu +\varepsilon _{t}+\sum _{i=1}^{q}\theta _{i}\varepsilon _{t-i}}$$
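To make these two definitions concrete, here's a small simulation sketch of mine (not part of the original analysis) that generates a pure AR(1) and a pure MA(1) process with statsmodels:
from statsmodels.tsa.arima_process import ArmaProcess

# ArmaProcess takes lag polynomials: a leading 1, with the AR coefficients negated
ar1 = ArmaProcess(ar=[1, -0.8], ma=[1])   # X_t = 0.8 X_{t-1} + e_t
ma1 = ArmaProcess(ar=[1], ma=[1, 0.8])    # X_t = e_t + 0.8 e_{t-1}
simulated = pd.DataFrame({'AR(1)': ar1.generate_sample(nsample=200),
                          'MA(1)': ma1.generate_sample(nsample=200)})
px.line(simulated, width=800).show()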
Finally, the Integrated part corrects the model in case it is non-stationary. In our case, the rolling average plot clearly shows long-term trends, which makes our data non-stationary. But just looking at a plot to decide whether the data is stationary or not isn't very scientific, is it? Luckily, the Augmented Dickey-Fuller test was designed just for that. Its null hypothesis assumes that the data is non-stationary. For the three series (domestic, low and high voltage) the p-value is high enough that we cannot reject non-stationarity.
from statsmodels.tsa.stattools import adfuller

for column in df.columns[1:]:
    print(f"{column}: {adfuller(df.loc[:, column])[1]}")

domestic: 0.842010473579625
low_voltage: 0.3599026643228239
high_voltage: 0.09802548827010471
To make a time series stationary, we have to compute the differences, as we did above with first-order differencing. The number of times we have to difference the data determines the last parameter, $d$. We can use the Dickey-Fuller test again to check how many rounds of differencing we need. We see that after differencing just once, we get really low p-values in the test.
for column in df.columns[1:]:
    print(f"First order differencing {column}: {adfuller(df.loc[:, column].diff().dropna())[1]}")

First order differencing domestic: 1.346953258378036e-27
First order differencing low_voltage: 7.674326893256915e-20
First order differencing high_voltage: 4.517936348563829e-18
The model, then, can be written as ARIMA(p, d, q). We've already decided the value of $d$, so now we need to decide $p$ and $q$: ARIMA(p, 1, q). To determine the values of $p$ and $q$, we can take a look at the lags of the PACF (for $p$) and ACF (for $q$) plots. We can consider any peaks falling outside the significance area. For example, let's take a closer look at the PACF plot corresponding to domestic consumption. Lags 1, 7, 8, 10 and 11 have a greater significance; lags 6 and 9 too, but to a lesser extent. The ACF plot has a high peak at lag 12, revealing seasonality. This suggests that ARIMA may not be the best model to forecast the energy consumption, but let's keep going with it; we'll take a look at Seasonal ARIMA (SARIMA) later.
pacf_plot = create_corr_plot('domestic', plot_pacf=True)
acf_plot = create_corr_plot('domestic')
fig = make_subplots(rows=1, cols=2,
column_titles=['Autocorrelation', 'Partial autocorrelation'],
)
for trace in range(len(pacf_plot["data"])):
fig.append_trace(pacf_plot["data"][trace], row=1, col=2)
for trace in range(len(acf_plot["data"])):
fig.append_trace(acf_plot["data"][trace], row=1, col=1)
fig.update_layout(title="Domestic consumption", width=800,
template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
As we can see, there are a lot of possible combinations for $p$ and $q$. We could arbitrarily choose one, or we can run a for loop over a range of possible combinations and select the one that yields the minimum error on the prediction. As with any machine learning model, we also have to split the data into train and test sets (we're using an 80/20 ratio in this case). The code below summarizes the process:
from itertools import product
from statsmodels.tsa.arima.model import ARIMA
from sklearn.metrics import mean_squared_error as mse
from statsmodels.tools.sm_exceptions import ConvergenceWarning, ValueWarning
import warnings
warnings.simplefilter('ignore', ValueWarning)
warnings.simplefilter('ignore', ConvergenceWarning)
# warnings.simplefilter('ignore', UserWarning)
split_idx = df.index[int(len(df) * .8)]
domestic_train = df.loc[:split_idx, 'domestic']
domestic_test = df.loc[split_idx:, 'domestic']
p_range = range(1,12)
d_range = range(0,2)
q_range = range(1,13)
pdq_combinations = list(product(p_range, d_range, q_range))
combinations = []
rmses = []

for pdq in pdq_combinations:
    try:
        model = ARIMA(domestic_train, order=pdq).fit()
        # Forecast from the end of the training set onwards
        prediction = model.predict(start=len(domestic_train), end=len(df))
        rmse = np.sqrt(mse(domestic_test, prediction))
        combinations.append(pdq)
        rmses.append(rmse)
    except Exception:
        continue
results = pd.DataFrame({'combination': combinations, 'rmse': rmses})
# Show the best (lowest-RMSE) combination
results.loc[[results.rmse.idxmin()]]
| | combination | rmse |
|---|---|---|
| 77 | (4, 0, 6) | 22030.620452 |
As we can see, even the best combination gives quite a high error, and looking at the plot we see that, although it captures some of the fluctuations, the prediction is not very accurate.
comb = results.loc[results.rmse.idxmin(), 'combination']
error = results.rmse.min()
model = ARIMA(domestic_train, order=comb).fit()
forecast = model.predict(len(domestic_train), len(df))
px.line(pd.concat([df.loc[split_idx:,'domestic'], forecast], axis=1),
title=f"""ARIMA model prediction
<br><sup>p={comb[0]}, d={comb[1]}, q={comb[2]} | RMSE = {error:.2f}</sup>""",
width=800,
color_discrete_map=COLOR_MAP,) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
SARIMA models¶
In our case, it's very clear that we have seasonality. We saw it when we first plotted the data, when we decomposed it, and finally in the ACF and PACF plots, with those clear peaks at lag 12. Seasonal AutoRegressive Integrated Moving Average (SARIMA) is an extension of the ARIMA model designed to capture both non-seasonal and seasonal patterns in time series data. The SARIMA model is denoted as SARIMA(p, d, q)(P, D, Q)m, where (p, d, q) is the non-seasonal order, (P, D, Q) is the seasonal order, and $m$ is the length of the seasonal cycle. The non-seasonal parameters (p, d, q) are the same as in the ARIMA model, representing the autoregressive order, the degree of differencing, and the moving average order, respectively. The seasonal parameters (P, D, Q) mirror their non-seasonal counterparts but apply to the seasonal component.
The data we are looking at has a yearly seasonal component; therefore, $m$ will be 12, as we have monthly data. Having a P of 1, for example, means that the model will evaluate each point with an autoregression that takes one seasonal cycle (12 months in our case) into account.
To save some space, we're not showing the for loop used to select the best parameters here, but it follows the same logic as before.
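For reference, here's a minimal sketch of what that loop could look like, under the same assumptions as the ARIMA search (same train/test split, RMSE as the criterion). The parameter grid is illustrative and deliberately coarse, since each SARIMAX fit is slow, and the first test point is skipped so that actuals and forecasts line up with the split defined above:
from statsmodels.tsa.statespace.sarimax import SARIMAX

sarima_scores = []
# Illustrative, reduced grid; m is fixed to 12 (monthly data, yearly cycle)
for p, q, P, Q in product(range(0, 9, 2), range(0, 7, 2), range(0, 2), range(0, 2)):
    try:
        candidate = SARIMAX(domestic_train, order=(p, 0, q),
                            seasonal_order=(P, 1, Q, 12)).fit(disp=False)
        pred = candidate.get_forecast(steps=len(df) - len(domestic_train)).predicted_mean
        sarima_scores.append(((p, 0, q), (P, 1, Q, 12), np.sqrt(mse(domestic_test.iloc[1:], pred))))
    except Exception:
        continue
sarima_scores.sort(key=lambda r: r[-1])  # lowest RMSE first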
from statsmodels.tsa.statespace.sarimax import SARIMAX

model = SARIMAX(domestic_train, order=(8,0,6), seasonal_order=(1,1,1,12)).fit(disp=False)
forecast = model.get_forecast(steps=len(df) - len(domestic_train))
# Recompute the error for the SARIMA forecast, aligned with the out-of-sample period
error = np.sqrt(mse(domestic_test.iloc[1:], forecast.predicted_mean))
fig = px.line(pd.concat([df.loc[split_idx:,'domestic'], forecast.predicted_mean], axis=1),
title=f"""SARIMA model prediction
<br><sup>p={8}, d={0}, q={6} | P=1, D=1, Q=1, m=12 | RMSE = {error:.2f} | Confidence interval in grey</sup>""",
width=800,
color_discrete_map=COLOR_MAP,) \
.update_layout(template='plotly_dark',
plot_bgcolor='#34495E',
paper_bgcolor='#34495E',
) \
.update_xaxes(showgrid=False) \
.update_yaxes(showgrid=False)
confidence_intervals = forecast.conf_int()
fig.add_trace(go.Scatter(x=confidence_intervals.index,
y=confidence_intervals['lower domestic'],
line=dict(color='rgba(0,0,0,0)'), showlegend=False))
fig.add_trace(go.Scatter(x=confidence_intervals.index,
y=confidence_intervals['upper domestic'],
line=dict(color='rgba(0,0,0,0)'),
fillcolor='rgba(250,250,250,0.1)',
fill='tonexty', showlegend=False))
fig.show()
Conclusion¶
Time series forecasting is a vast field with applications in various domains, including finance, economics, and environmental science. This brief overview just shows key concepts such as trends, seasonality, and the ARIMA model. However, time series analysis has a multitude of advanced techniques and models, each designed for specific data characteristics and forecasting requirements.
This introduction is just a stepping stone, so feel free to explore the diverse landscape of time series forecasting and discover the many more tools available for predicting future trends and patterns.
Further steps¶
Beyond the introductory concepts, we could explore other machine learning algorithms, like Long Short-Term Memory (LSTM) networks or ensemble methods, which can capture intricate patterns that traditional models may overlook.
Another thing that I really want to do is integrate economic metrics to study the influence that the economy has on energy consumption. By correlating the time series with economic indicators such as GDP, inflation rates, or interest rates, we could uncover other relationships and maybe use them to refine the predictive models. But that's material for many other articles!