In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from plotly.subplots import make_subplots
import numpy as np
from sklearn.metrics import mean_absolute_error, mean_squared_error
from statsmodels.tsa.holtwinters import ExponentialSmoothing
from prophet import Prophet
import warnings
warnings.filterwarnings('ignore')
# Author: Mehul Parashar
# Load the dataset
file_path = 'surface temperature anomalies by year.csv'
data = pd.read_csv(file_path)
# Rename the first column to 'Country' and correct other columns
data.columns = ['Country'] + data.columns[1:].tolist()
# Extract relevant columns for analysis
years = [col for col in data.columns if col.isdigit()]
anomalies_data = data[['Country'] + years]
# Convert year columns to numeric, handling errors in the process
anomalies_data[years] = anomalies_data[years].apply(pd.to_numeric, errors='coerce')
# Drop any rows with all NaN values for years
anomalies_data = anomalies_data.dropna(subset=years, how='all')
# Calculate global mean temperature anomalies by year
global_anomalies = anomalies_data[years].mean()
# Apply a moving average to smooth the data
window_size = 5 # You can adjust the window size for more or less smoothing
smoothed_anomalies = global_anomalies.rolling(window=window_size).mean()
# Initial Visualization
plt.figure(figsize=(12, 6))
plt.plot(smoothed_anomalies.index, smoothed_anomalies.values, marker='o', linestyle='-', color='b')
plt.title('Global Surface Temperature Anomalies (Yearly Average)')
plt.xlabel('Year')
plt.ylabel('Temperature Anomaly (°C)')
plt.grid(True)
plt.xticks(rotation=90)
plt.show()
# List of selected countries
selected_countries = ['United States', 'China', 'India', 'Brazil', 'Russia']
# Extract data for the selected countries
selected_data = anomalies_data[anomalies_data['Country'].isin(selected_countries)]
# Plotting the temperature anomalies for selected countries
plt.figure(figsize=(14, 8))
for country in selected_countries:
country_data = selected_data[selected_data['Country'] == country].iloc[:, 1:].T
country_data.columns = [country]
plt.plot(country_data.index, country_data[country], marker='o', linestyle='-', label=country)
plt.title('Surface Temperature Anomalies for Selected Countries')
plt.xlabel('Year')
plt.ylabel('Temperature Anomaly (°C)')
plt.legend()
plt.grid(True)
plt.xticks(rotation=90)
plt.show()
# Historical events to highlight
historical_events = {
'Industrial Revolution': (1760, 1840),
'World War I': (1914, 1918),
'Great Depression': (1929, 1939),
'World War II': (1939, 1945),
'Oil Crisis': (1973, 1973),
'COVID-19 Pandemic': (2020, 2020)
}
# Initialize the figure
fig = go.Figure()
# Add country-specific traces
for country in selected_countries:
country_data = selected_data[selected_data['Country'] == country].iloc[:, 1:].T
country_data.columns = [country]
fig.add_trace(go.Scatter(x=country_data.index, y=country_data[country], mode='lines+markers', name=country))
# Add historical events as shaded areas
for event, (start, end) in historical_events.items():
fig.add_shape(
type="rect",
x0=start, x1=end,
y0=min(global_anomalies), y1=max(global_anomalies),
fillcolor="LightSalmon", opacity=0.5, layer="below", line_width=0,
name=event
)
fig.add_trace(go.Scatter(
x=[(start + end) / 2],
y=[max(global_anomalies)],
text=[event],
mode="text",
showlegend=False
))
# Update layout for dropdown menu
buttons = []
for country in selected_countries:
visible = [country == trace.name for trace in fig.data]
buttons.append(
dict(label=country,
method="update",
args=[{"visible": visible},
{"title": f"Temperature Anomalies: {country}"}]))
fig.update_layout(
updatemenus=[dict(active=0, buttons=buttons, x=1.15, xanchor="right", y=1.1, yanchor="top")],
title="Surface Temperature Anomalies with Significant Historical Events",
xaxis_title="Year",
yaxis_title="Temperature Anomaly (°C)",
showlegend=False
)
# Show the interactive plot
fig.show()
# Function to forecast temperature anomalies and calculate metrics
def forecast_anomalies(data, country, years, forecast_steps=20):
country_data = data[data['Country'] == country][years].T
if not country_data.empty:
country_data.columns = [country]
country_anomalies = country_data[country].dropna()
model = ARIMA(country_anomalies, order=(5, 1, 0))
model_fit = model.fit()
forecast = model_fit.forecast(steps=forecast_steps)
forecast_years = np.arange(int(country_anomalies.index.max()) + 1, int(country_anomalies.index.max()) + 1 + forecast_steps)
# Calculate metrics
aic = model_fit.aic
bic = model_fit.bic
mae = mean_absolute_error(country_anomalies, model_fit.predict(start=0, end=len(country_anomalies)-1))
mse = mean_squared_error(country_anomalies, model_fit.predict(start=0, end=len(country_anomalies)-1))
pvalues = model_fit.pvalues
metrics = {
'AIC': aic,
'BIC': bic,
'MAE': mae,
'MSE': mse,
'P-Values': pvalues
}
return country_anomalies, forecast, forecast_years, metrics
return None, None, None, None
# Prepare data for global anomalies
global_anomalies = anomalies_data[years].mean()
# Fit ARIMA model for global anomalies
global_model = ARIMA(global_anomalies, order=(5, 1, 0))
global_model_fit = global_model.fit()
# Forecast for global anomalies
global_forecast_steps = 20
global_forecast = global_model_fit.forecast(steps=global_forecast_steps)
global_forecast_years = np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + global_forecast_steps)
# Calculate metrics for global anomalies
global_aic = global_model_fit.aic
global_bic = global_model_fit.bic
global_mae = mean_absolute_error(global_anomalies, global_model_fit.predict(start=0, end=len(global_anomalies)-1))
global_mse = mean_squared_error(global_anomalies, global_model_fit.predict(start=0, end=len(global_anomalies)-1))
global_pvalues = global_model_fit.pvalues
global_metrics = {
'AIC': global_aic,
'BIC': global_bic,
'MAE': global_mae,
'MSE': global_mse,
'P-Values': global_pvalues
}
# Print global metrics
print("Global Model Metrics:")
for key, value in global_metrics.items():
print(f"{key}: {value}")
# Create the figure
fig = go.Figure()
# Add global data trace
fig.add_trace(go.Scatter(
x=global_anomalies.index,
y=global_anomalies.values,
mode='lines+markers',
name='Global Historical Data'
))
fig.add_trace(go.Scatter(
x=global_forecast_years,
y=global_forecast,
mode='lines+markers',
name='Global Forecast',
line=dict(dash='dash', color='red')
))
# Add traces for each selected country and print their metrics
for country in selected_countries:
country_anomalies, country_forecast, country_forecast_years, country_metrics = forecast_anomalies(anomalies_data, country, years)
if country_anomalies is not None:
fig.add_trace(go.Scatter(
x=country_anomalies.index,
y=country_anomalies.values,
mode='lines+markers',
name=f'{country} Historical Data'
))
fig.add_trace(go.Scatter(
x=country_forecast_years,
y=country_forecast,
mode='lines+markers',
name=f'{country} Forecast',
line=dict(dash='dash', color='red')
))
# Print country metrics
print(f"\n{country} Model Metrics:")
for key, value in country_metrics.items():
print(f"{key}: {value}")
# Update the layout
fig.update_layout(
title='Surface Temperature Anomalies with Future Projections for Selected Countries',
xaxis_title='Year',
yaxis_title='Temperature Anomaly (°C)',
xaxis=dict(tickangle=45, dtick=5),
legend=dict(x=1, y=1, traceorder='normal'),
margin=dict(l=40, r=40, t=40, b=40),
width=1000,
height=600
)
# Show the interactive plot
fig.show()
# Function to calculate Z-scores
def calculate_z_scores(data):
mean = np.mean(data)
std = np.std(data)
z_scores = (data - mean) / std
return z_scores
# Function to calculate IQR and identify outliers
def calculate_iqr_outliers(data):
Q1 = np.percentile(data, 25)
Q3 = np.percentile(data, 75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR
outliers = data[(data < lower_bound) | (data > upper_bound)]
return outliers
# Calculate Z-scores for global anomalies with threshold 1.5
global_z_scores = calculate_z_scores(global_anomalies)
z_score_outliers = global_anomalies[np.abs(global_z_scores) > 1.5]
# Calculate IQR outliers for global anomalies
iqr_outliers = calculate_iqr_outliers(global_anomalies)
# Print outliers
print("Z-Score Outliers in Global Temperature Anomalies (threshold 1.5):")
print(z_score_outliers)
print("\nIQR Outliers in Global Temperature Anomalies:")
print(iqr_outliers)
# Plot global anomalies and highlight outliers
fig = go.Figure()
fig.add_trace(go.Scatter(
x=global_anomalies.index,
y=global_anomalies.values,
mode='lines+markers',
name='Global Anomalies'
))
fig.add_trace(go.Scatter(
x=z_score_outliers.index,
y=z_score_outliers.values,
mode='markers',
marker=dict(color='red', size=10),
name='Z-Score Outliers'
))
fig.add_trace(go.Scatter(
x=iqr_outliers.index,
y=iqr_outliers.values,
mode='markers',
marker=dict(color='orange', size=10),
name='IQR Outliers'
))
# Update layout
fig.update_layout(
title='Global Surface Temperature Anomalies with Outliers',
xaxis_title='Year',
yaxis_title='Temperature Anomaly (°C)',
xaxis=dict(tickangle=45, dtick=5),
margin=dict(l=40, r=40, t=40, b=40),
width=1000,
height=600
)
# Show the interactive plot
fig.show()
# Fit ARIMA model for global anomalies
arima_model = ARIMA(global_anomalies, order=(5, 1, 0))
arima_fit = arima_model.fit()
arima_forecast = arima_fit.forecast(steps=20)
# Fit ETS model for global anomalies
ets_model = ExponentialSmoothing(global_anomalies, trend='add', seasonal=None)
ets_fit = ets_model.fit()
ets_forecast = ets_fit.forecast(steps=20)
# Prepare data for Prophet model
prophet_data = global_anomalies.reset_index()
prophet_data.columns = ['ds', 'y']
prophet_model = Prophet()
prophet_model.fit(prophet_data)
future = prophet_model.make_future_dataframe(periods=20, freq='Y')
prophet_forecast = prophet_model.predict(future)
# Create an ensemble forecast by averaging the forecasts
ensemble_forecast = (arima_forecast + ets_forecast + prophet_forecast['yhat'][-20:].values) / 3
# Calculate metrics for the ensemble model
ensemble_true = global_anomalies.values[-20:]
ensemble_mae = mean_absolute_error(ensemble_true, ensemble_forecast)
ensemble_mse = mean_squared_error(ensemble_true, ensemble_forecast)
print("Ensemble Model Metrics:")
print(f"MAE: {ensemble_mae}, MSE: {ensemble_mse}\n")
# Create the figure for comparison
fig = go.Figure()
# Add historical data trace
fig.add_trace(go.Scatter(
x=global_anomalies.index,
y=global_anomalies.values,
mode='lines+markers',
name='Historical Data'
))
# Add ARIMA forecast trace
fig.add_trace(go.Scatter(
x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
y=arima_forecast,
mode='lines+markers',
name='ARIMA Forecast',
line=dict(dash='dash')
))
# Add ETS forecast trace
fig.add_trace(go.Scatter(
x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
y=ets_forecast,
mode='lines+markers',
name='ETS Forecast',
line=dict(dash='dot')
))
# Add Prophet forecast trace
fig.add_trace(go.Scatter(
x=prophet_forecast['ds'][-20:],
y=prophet_forecast['yhat'][-20:],
mode='lines+markers',
name='Prophet Forecast',
line=dict(dash='dashdot')
))
# Add Ensemble forecast trace
fig.add_trace(go.Scatter(
x=np.arange(int(global_anomalies.index.max()) + 1, int(global_anomalies.index.max()) + 1 + 20),
y=ensemble_forecast,
mode='lines+markers',
name='Ensemble Forecast',
line=dict(dash='solid', color='black')
))
# Update the layout
fig.update_layout(
title='Global Surface Temperature Anomalies with Different Forecasting Models',
xaxis_title='Year',
yaxis_title='Temperature Anomaly (°C)',
xaxis=dict(tickangle=45, dtick=5),
legend=dict(x=1, y=1, traceorder='normal'),
margin=dict(l=40, r=40, t=40, b=40),
width=1200,
height=800
)
# Show the interactive plot
fig.show()
# Function to forecast temperature anomalies for a country
def forecast_anomalies(data, country, years, forecast_steps=20):
country_data = data[data['Country'] == country][years].T
if not country_data.empty:
country_data.columns = [country]
country_anomalies = country_data[country].dropna()
model = ARIMA(country_anomalies, order=(5, 1, 0))
model_fit = model.fit()
forecast = model_fit.forecast(steps=forecast_steps)
forecast_years = np.arange(int(country_anomalies.index.max()) + 1, int(country_anomalies.index.max()) + 1 + forecast_steps)
return country_anomalies, forecast, forecast_years
return None, None, None
# Add traces for each selected country
for country in selected_countries:
country_anomalies, country_forecast, country_forecast_years = forecast_anomalies(anomalies_data, country, years)
if country_anomalies is not None:
fig.add_trace(go.Scatter(
x=country_anomalies.index,
y=country_anomalies.values,
mode='lines+markers',
name=f'{country} Historical Data'
))
fig.add_trace(go.Scatter(
x=country_forecast_years,
y=country_forecast,
mode='lines+markers',
name=f'{country} Forecast',
line=dict(dash='dash', color='red')
))
# Update the layout for all data
fig.update_layout(
title='Surface Temperature Anomalies with Future Projections for Selected Countries',
xaxis_title='Year',
yaxis_title='Temperature Anomaly (°C)',
xaxis=dict(tickangle=45, dtick=5),
legend=dict(x=1, y=1, traceorder='normal'),
margin=dict(l=40, r=40, t=40, b=40),
width=1200,
height=800
)
# Show the interactive plot
fig.show()
Global Model Metrics: AIC: 66.75678593359618 BIC: 78.69069021298182 MAE: 0.32980971252824776 MSE: 0.18012920615018987 P-Values: ar.L1 4.194622e-05 ar.L2 3.039029e-01 ar.L3 7.167704e-01 ar.L4 3.989121e-01 ar.L5 2.989243e-01 sigma2 5.376185e-07 dtype: float64 United States Model Metrics: AIC: 186.63030996716685 BIC: 198.5642142465525 MAE: 0.9803119541338116 MSE: 1.4911672278157484 P-Values: ar.L1 1.471311e-08 ar.L2 1.893640e-05 ar.L3 6.882671e-02 ar.L4 6.793268e-02 ar.L5 3.102365e-01 sigma2 1.387313e-05 dtype: float64 China Model Metrics: AIC: 173.48698502322495 BIC: 185.4208893026106 MAE: 0.8775012028734492 MSE: 1.1552942833915747 P-Values: ar.L1 1.479308e-07 ar.L2 1.520231e-05 ar.L3 7.624876e-02 ar.L4 1.598503e-01 ar.L5 8.518581e-01 sigma2 2.706560e-05 dtype: float64 India Model Metrics: AIC: 103.31925559334901 BIC: 115.25315987273466 MAE: 0.43272104424346475 MSE: 0.3068332976098624 P-Values: ar.L1 4.101876e-08 ar.L2 1.270761e-02 ar.L3 2.816376e-01 ar.L4 2.959467e-01 ar.L5 4.592368e-02 sigma2 1.184048e-07 dtype: float64 Brazil Model Metrics: AIC: 66.92930538450622 BIC: 78.86320966389187 MAE: 0.3291697228446682 MSE: 0.17983014624621144 P-Values: ar.L1 1.236477e-07 ar.L2 2.486680e-03 ar.L3 3.829143e-02 ar.L4 2.922882e-01 ar.L5 6.470128e-01 sigma2 7.041980e-07 dtype: float64 Russia Model Metrics: AIC: 261.2066097344276 BIC: 273.1405140138133 MAE: 1.8402886965002734 MSE: 5.760850581114571 P-Values: ar.L1 3.984135e-10 ar.L2 2.211299e-02 ar.L3 1.840635e-01 ar.L4 3.176863e-01 ar.L5 9.111392e-01 sigma2 3.045046e-07 dtype: float64
Z-Score Outliers in Global Temperature Anomalies (threshold 1.5): 2022 0.674184 2017 0.551684 1975 5.141480 1973 5.279541 1971 5.267449 1970 5.532704 dtype: float64 IQR Outliers in Global Temperature Anomalies: Series([], dtype: float64)
12:19:51 - cmdstanpy - INFO - Chain [1] start processing 12:19:52 - cmdstanpy - INFO - Chain [1] done processing
Ensemble Model Metrics: MAE: 1.0009705753443017, MSE: 1.3663218154628598
In [ ]: