In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from plotly.subplots import make_subplots
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet
import warnings 
warnings.filterwarnings('ignore') 



# Author: Mehul Parashar

# Load the dataset
file_path = 'surface temperature anomalies by year.csv'
data = pd.read_csv(file_path)

# Rename the first column to 'Country' and correct other columns
data.columns = ['Country'] + data.columns[1:].tolist()

# Extract relevant columns for analysis
years = [col for col in data.columns if col.isdigit()]
anomalies_data = data[['Country'] + years]

# Convert year columns to numeric, handling errors in the process
anomalies_data[years] = anomalies_data[years].apply(pd.to_numeric, errors='coerce')

# Drop any rows with all NaN values for years
anomalies_data = anomalies_data.dropna(subset=years, how='all')

# Calculate global mean temperature anomalies by year
global_anomalies = anomalies_data[years].mean()

# Apply a moving average to smooth the data
window_size = 5  # You can adjust the window size for more or less smoothing
smoothed_anomalies = global_anomalies.rolling(window=window_size).mean()

# Initial Visualization
plt.figure(figsize=(12, 6))
plt.plot(smoothed_anomalies.index, smoothed_anomalies.values, marker='o', linestyle='-', color='b')
plt.title('Global Surface Temperature Anomalies (Yearly Average)')
plt.xlabel('Year')
plt.ylabel('Temperature Anomaly (°C)')
plt.grid(True)
plt.xticks(rotation=90)
plt.show()

# List of selected countries
selected_countries = ['United States', 'China', 'India', 'Brazil', 'Russia']

# Extract data for the selected countries
selected_data = anomalies_data[anomalies_data['Country'].isin(selected_countries)]

# Plotting the temperature anomalies for selected countries
plt.figure(figsize=(14, 8))

for country in selected_countries:
    country_data = selected_data[selected_data['Country'] == country].iloc[:, 1:].T
    country_data.columns = [country]
    plt.plot(country_data.index, country_data[country], marker='o', linestyle='-', label=country)

plt.title('Surface Temperature Anomalies for Selected Countries')
plt.xlabel('Year')
plt.ylabel('Temperature Anomaly (°C)')
plt.legend()
plt.grid(True)
plt.xticks(rotation=90)
plt.show()

# Historical events to highlight
historical_events = {
    'Industrial Revolution': (1760, 1840),
    'World War I': (1914, 1918),
    'Great Depression': (1929, 1939),
    'World War II': (1939, 1945),
    'Oil Crisis': (1973, 1973),
    'COVID-19 Pandemic': (2020, 2020)
}

# Initialize the figure
fig = go.Figure()

# Add country-specific traces
for country in selected_countries:
    country_data = selected_data[selected_data['Country'] == country].iloc[:, 1:].T
    country_data.columns = [country]
    fig.add_trace(go.Scatter(x=country_data.index, y=country_data[country], mode='lines+markers', name=country))

# Add historical events as shaded areas
for event, (start, end) in historical_events.items():
    fig.add_shape(
        type="rect",
        x0=start, x1=end,
        y0=min(global_anomalies), y1=max(global_anomalies),
        fillcolor="LightSalmon", opacity=0.5, layer="below", line_width=0,
        name=event
    )
    fig.add_trace(go.Scatter(
        x=[(start + end) / 2],
        y=[max(global_anomalies)],
        text=[event],
        mode="text",
        showlegend=False
    ))

# Update layout for dropdown menu
buttons = []
for country in selected_countries:
    visible = [country == trace.name for trace in fig.data]
    buttons.append(
        dict(label=country,
             method="update",
             args=[{"visible": visible},
                   {"title": f"Temperature Anomalies: {country}"}]))

fig.update_layout(
    updatemenus=[dict(active=0, buttons=buttons, x=1.15, xanchor="right", y=1.1, yanchor="top")],
    title="Surface Temperature Anomalies with Significant Historical Events",
    xaxis_title="Year",
    yaxis_title="Temperature Anomaly (°C)",
    showlegend=False
)

# Show the interactive plot
fig.show()

# Function to forecast temperature anomalies and calculate metrics
def forecast_anomalies(data, country, years, forecast_steps=20):
    country_data = data[data['Country'] == country][years].T
    if not country_data.empty:
        country_data.columns = [country]
        country_anomalies = country_data[country].dropna()

        model = ARIMA(country_anomalies, order=(5, 1, 0))
        model_fit = model.fit()

        forecast = model_fit.forecast(steps=forecast_steps)
        forecast_years = np.arange(int(country_anomalies.index.max()) + 1, int(country_anomalies.index.max()) + 1 + forecast_steps)

        # Calculate metrics
        aic = model_fit.aic
        bic = model_fit.bic
        mae = mean_absolute_error(country_anomalies, model_fit.predict(start=0, end=len(country_anomalies)-1))
        mse = mean_squared_error(country_anomalies, model_fit.predict(start=0, end=len(country_anomalies)-1))
        pvalues = model_fit.pvalues

        metrics = {
            'AIC': aic,
            'BIC': bic,
            'MAE': mae,
            'MSE': mse,
            'P-Values': pvalues
        }

        return country_anomalies, forecast, forecast_years, metrics
    return None, None, None, None

# Prepare data for global anomalies
global_anomalies = anomalies_data[years].mean()

# Fit ARIMA model for global anomalies
global_model = ARIMA(global_anomalies, order=(5, 1, 0))
global_model_fit = global_model.fit()

# Forecast for global anomalies
global_forecast_steps = 20
global_forecast = global_model_fit.forecast(steps=global_forecast_steps)
global_forecast_years = np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + global_forecast_steps)

# Calculate metrics for global anomalies
global_aic = global_model_fit.aic
global_bic = global_model_fit.bic
global_mae = mean_absolute_error(global_anomalies, global_model_fit.predict(start=0, end=len(global_anomalies)-1))
global_mse = mean_squared_error(global_anomalies, global_model_fit.predict(start=0, end=len(global_anomalies)-1))
global_pvalues = global_model_fit.pvalues

global_metrics = {
    'AIC': global_aic,
    'BIC': global_bic,
    'MAE': global_mae,
    'MSE': global_mse,
    'P-Values': global_pvalues
}

# Print global metrics
print("Global Model Metrics:")
for key, value in global_metrics.items():
    print(f"{key}: {value}")

# Create the figure
fig = go.Figure()

# Add global data trace
fig.add_trace(go.Scatter(
    x=global_anomalies.index,
    y=global_anomalies.values,
    mode='lines+markers',
    name='Global Historical Data'
))

fig.add_trace(go.Scatter(
    x=global_forecast_years,
    y=global_forecast,
    mode='lines+markers',
    name='Global Forecast',
    line=dict(dash='dash', color='red')
))

# Add traces for each selected country and print their metrics
for country in selected_countries:
    country_anomalies, country_forecast, country_forecast_years, country_metrics = forecast_anomalies(anomalies_data, country, years)
    
    if country_anomalies is not None:
        fig.add_trace(go.Scatter(
            x=country_anomalies.index,
            y=country_anomalies.values,
            mode='lines+markers',
            name=f'{country} Historical Data'
        ))
        
        fig.add_trace(go.Scatter(
            x=country_forecast_years,
            y=country_forecast,
            mode='lines+markers',
            name=f'{country} Forecast',
            line=dict(dash='dash', color='red')
        ))

        # Print country metrics
        print(f"\n{country} Model Metrics:")
        for key, value in country_metrics.items():
            print(f"{key}: {value}")

# Update the layout
fig.update_layout(
    title='Surface Temperature Anomalies with Future Projections for Selected Countries',
    xaxis_title='Year',
    yaxis_title='Temperature Anomaly (°C)',
    xaxis=dict(tickangle=45, dtick=5),
    legend=dict(x=1, y=1, traceorder='normal'),
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=600
)

# Show the interactive plot
fig.show()

# Function to calculate Z-scores
def calculate_z_scores(data):
    mean = np.mean(data)
    std = np.std(data)
    z_scores = (data - mean) / std
    return z_scores

# Function to calculate IQR and identify outliers
def calculate_iqr_outliers(data):
    Q1 = np.percentile(data, 25)
    Q3 = np.percentile(data, 75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = data[(data < lower_bound) | (data > upper_bound)]
    return outliers

# Calculate Z-scores for global anomalies with threshold 1.5
global_z_scores = calculate_z_scores(global_anomalies)
z_score_outliers = global_anomalies[np.abs(global_z_scores) > 1.5]

# Calculate IQR outliers for global anomalies
iqr_outliers = calculate_iqr_outliers(global_anomalies)

# Print outliers
print("Z-Score Outliers in Global Temperature Anomalies (threshold 1.5):")
print(z_score_outliers)
print("\nIQR Outliers in Global Temperature Anomalies:")
print(iqr_outliers)

# Plot global anomalies and highlight outliers
fig = go.Figure()

fig.add_trace(go.Scatter(
    x=global_anomalies.index,
    y=global_anomalies.values,
    mode='lines+markers',
    name='Global Anomalies'
))

fig.add_trace(go.Scatter(
    x=z_score_outliers.index,
    y=z_score_outliers.values,
    mode='markers',
    marker=dict(color='red', size=10),
    name='Z-Score Outliers'
))

fig.add_trace(go.Scatter(
    x=iqr_outliers.index,
    y=iqr_outliers.values,
    mode='markers',
    marker=dict(color='orange', size=10),
    name='IQR Outliers'
))

# Update layout
fig.update_layout(
    title='Global Surface Temperature Anomalies with Outliers',
    xaxis_title='Year',
    yaxis_title='Temperature Anomaly (°C)',
    xaxis=dict(tickangle=45, dtick=5),
    margin=dict(l=40, r=40, t=40, b=40),
    width=1000,
    height=600
)

# Show the interactive plot
fig.show()

# Fit ARIMA model for global anomalies
arima_model = ARIMA(global_anomalies, order=(5, 1, 0))
arima_fit = arima_model.fit()
arima_forecast = arima_fit.forecast(steps=20)

# Fit ETS model for global anomalies
ets_model = ExponentialSmoothing(global_anomalies, trend='add', seasonal=None)
ets_fit = ets_model.fit()
ets_forecast = ets_fit.forecast(steps=20)

# Prepare data for Prophet model
prophet_data = global_anomalies.reset_index()
prophet_data.columns = ['ds', 'y']
prophet_model = Prophet()
prophet_model.fit(prophet_data)
future = prophet_model.make_future_dataframe(periods=20, freq='Y')
prophet_forecast = prophet_model.predict(future)

# Create an ensemble forecast by averaging the forecasts
ensemble_forecast = (arima_forecast + ets_forecast + prophet_forecast['yhat'][-20:].values) / 3

# Calculate metrics for the ensemble model
ensemble_true = global_anomalies.values[-20:]
ensemble_mae = mean_absolute_error(ensemble_true, ensemble_forecast)
ensemble_mse = mean_squared_error(ensemble_true, ensemble_forecast)

print("Ensemble Model Metrics:")
print(f"MAE: {ensemble_mae}, MSE: {ensemble_mse}\n")

# Create the figure for comparison
fig = go.Figure()

# Add historical data trace
fig.add_trace(go.Scatter(
    x=global_anomalies.index,
    y=global_anomalies.values,
    mode='lines+markers',
    name='Historical Data'
))

# Add ARIMA forecast trace
fig.add_trace(go.Scatter(
    x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
    y=arima_forecast,
    mode='lines+markers',
    name='ARIMA Forecast',
    line=dict(dash='dash')
))

# Add ETS forecast trace
fig.add_trace(go.Scatter(
    x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
    y=ets_forecast,
    mode='lines+markers',
    name='ETS Forecast',
    line=dict(dash='dot')
))

# Add Prophet forecast trace
fig.add_trace(go.Scatter(
    x=prophet_forecast['ds'][-20:],
    y=prophet_forecast['yhat'][-20:],
    mode='lines+markers',
    name='Prophet Forecast',
    line=dict(dash='dashdot')
))

# Add Ensemble forecast trace
fig.add_trace(go.Scatter(
    x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
    y=ensemble_forecast,
    mode='lines+markers',
    name='Ensemble Forecast',
    line=dict(dash='solid', color='black')
))

# Update the layout
fig.update_layout(
    title='Global Surface Temperature Anomalies with Different Forecasting Models',
    xaxis_title='Year',
    yaxis_title='Temperature Anomaly (°C)',
    xaxis=dict(tickangle=45, dtick=5),
    legend=dict(x=1, y=1, traceorder='normal'),
    margin=dict(l=40, r=40, t=40, b=40),
    width=1200,
    height=800
)

# Show the interactive plot
fig.show()

# Function to forecast temperature anomalies for a country
def forecast_anomalies(data, country, years, forecast_steps=20):
    country_data = data[data['Country'] == country][years].T
    if not country_data.empty:
        country_data.columns = [country]
        country_anomalies = country_data[country].dropna()

        model = ARIMA(country_anomalies, order=(5, 1, 0))
        model_fit = model.fit()

        forecast = model_fit.forecast(steps=forecast_steps)
        forecast_years = np.arange(int(country_anomalies.index.max()) + 1, int(country_anomalies.index.max()) + 1 + forecast_steps)
        
        return country_anomalies, forecast, forecast_years
    return None, None, None

# Add traces for each selected country
for country in selected_countries:
    country_anomalies, country_forecast, country_forecast_years = forecast_anomalies(anomalies_data, country, years)
    
    if country_anomalies is not None:
        fig.add_trace(go.Scatter(
            x=country_anomalies.index,
            y=country_anomalies.values,
            mode='lines+markers',
            name=f'{country} Historical Data'
        ))
        
        fig.add_trace(go.Scatter(
            x=country_forecast_years,
            y=country_forecast,
            mode='lines+markers',
            name=f'{country} Forecast',
            line=dict(dash='dash', color='red')
        ))

# Update the layout for all data
fig.update_layout(
    title='Surface Temperature Anomalies with Future Projections for Selected Countries',
    xaxis_title='Year',
    yaxis_title='Temperature Anomaly (°C)',
    xaxis=dict(tickangle=45, dtick=5),
    legend=dict(x=1, y=1, traceorder='normal'),
    margin=dict(l=40, r=40, t=40, b=40),
    width=1200,
    height=800
)

# Show the interactive plot
fig.show()
No description has been provided for this image
No description has been provided for this image
Global Model Metrics:
AIC: 66.75678593359618
BIC: 78.69069021298182
MAE: 0.32980971252824776
MSE: 0.18012920615018987
P-Values: ar.L1     4.194622e-05
ar.L2     3.039029e-01
ar.L3     7.167704e-01
ar.L4     3.989121e-01
ar.L5     2.989243e-01
sigma2    5.376185e-07
dtype: float64

United States Model Metrics:
AIC: 186.63030996716685
BIC: 198.5642142465525
MAE: 0.9803119541338116
MSE: 1.4911672278157484
P-Values: ar.L1     1.471311e-08
ar.L2     1.893640e-05
ar.L3     6.882671e-02
ar.L4     6.793268e-02
ar.L5     3.102365e-01
sigma2    1.387313e-05
dtype: float64

China Model Metrics:
AIC: 173.48698502322495
BIC: 185.4208893026106
MAE: 0.8775012028734492
MSE: 1.1552942833915747
P-Values: ar.L1     1.479308e-07
ar.L2     1.520231e-05
ar.L3     7.624876e-02
ar.L4     1.598503e-01
ar.L5     8.518581e-01
sigma2    2.706560e-05
dtype: float64

India Model Metrics:
AIC: 103.31925559334901
BIC: 115.25315987273466
MAE: 0.43272104424346475
MSE: 0.3068332976098624
P-Values: ar.L1     4.101876e-08
ar.L2     1.270761e-02
ar.L3     2.816376e-01
ar.L4     2.959467e-01
ar.L5     4.592368e-02
sigma2    1.184048e-07
dtype: float64

Brazil Model Metrics:
AIC: 66.92930538450622
BIC: 78.86320966389187
MAE: 0.3291697228446682
MSE: 0.17983014624621144
P-Values: ar.L1     1.236477e-07
ar.L2     2.486680e-03
ar.L3     3.829143e-02
ar.L4     2.922882e-01
ar.L5     6.470128e-01
sigma2    7.041980e-07
dtype: float64

Russia Model Metrics:
AIC: 261.2066097344276
BIC: 273.1405140138133
MAE: 1.8402886965002734
MSE: 5.760850581114571
P-Values: ar.L1     3.984135e-10
ar.L2     2.211299e-02
ar.L3     1.840635e-01
ar.L4     3.176863e-01
ar.L5     9.111392e-01
sigma2    3.045046e-07
dtype: float64
Z-Score Outliers in Global Temperature Anomalies (threshold 1.5):
2022    0.674184
2017    0.551684
1975    5.141480
1973    5.279541
1971    5.267449
1970    5.532704
dtype: float64

IQR Outliers in Global Temperature Anomalies:
Series([], dtype: float64)
12:19:51 - cmdstanpy - INFO - Chain [1] start processing
12:19:52 - cmdstanpy - INFO - Chain [1] done processing
Ensemble Model Metrics:
MAE: 1.0009705753443017, MSE: 1.3663218154628598

In [ ]: