Predicting 2020-2021 English Premier League Table Results Using Machine Learning

By William Daseking


The English Premier League is one of, if not the, top soccer leagues in the world. Millions of people around the world tune in every weekend to see their team play and hopefully win. With big-name players like Kevin De Bruyne, Mohamed Salah, Harry Kane, and Christian Pulisic, the league is full of excitement and glory. Unlike American sports leagues, the Premier League, like most other soccer leagues, does not use a playoff system to determine its champion. Instead, it uses a point system where teams earn points based on the results of their matches. The team with the most points at the end of the season is crowned champion and given the Premier League Trophy. Teams earn 3 points for a win, 1 point for a draw, and 0 points for a loss. Beyond declaring a champion, the 3 teams with the lowest point totals are relegated each season to a lower division, and the top 3 teams from the lower division are promoted to take their places. Additionally, the top 4 teams qualify for the UEFA Champions League, the preeminent European competition with millions in prize money on the line. Because of all this, every game matters: a slip-up could cost a team the championship or a chance to play in the Champions League, or even bring the humiliation of relegation to a lesser competition. The Premier League table is the table of the 20 teams in the league sorted by the number of points they have, and it determines all of the things above. Because of the importance of this table, predicting it can be very valuable, as it gives insight into who is on the right track and who should be worried. It can show you if your team is doing well, or if you should be wearing a paper bag over your head in embarrassment. 
This project attempts to predict the results of the Premier League table by taking advantage of historical statistics from previous seasons and using them in conjunction with machine learning techniques to get a good estimate of how the current 2020-2021 Premier League table will finish.

Importing the Necessary Python Libraries

Before we begin to load and work with the dataset, we first have to import the necessary libraries. The first library we import is Matplotlib’s pylab module, which helps us create our visualizations. The next two are NumPy and Pandas, which help us store and manipulate our dataset. Next is the Seaborn plotting library, which is built on top of Matplotlib’s plotting system and makes it easy to produce beautiful-looking visualizations. The last four imports are from the SciKit-Learn library, which will help us create our models and test how well they work.
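The imports described above can be sketched as follows. The specific SciKit-Learn pieces shown are assumptions based on the models and evaluation used later in this project:

```python
# A minimal sketch of the imports described above; the four SciKit-Learn
# imports are assumptions based on the models used later in the project.
import matplotlib.pylab as plt   # plotting
import numpy as np               # numerical arrays
import pandas as pd              # DataFrames for storing the dataset
import seaborn as sns            # statistical visualizations on top of Matplotlib

from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
```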

Loading Our Dataset

After importing all the Python libraries we need, it is time to load the dataset we will be working with. Our dataset was built from data and statistics on FBref, a soccer statistics site run by Sports Reference. It contains statistics dating back to the 1992-1993 Premier League season, the first year the league existed. Each season’s statistics are spread across three CSV files: a league table file that contains the final league table as well as some major statistics for each team that season, a squad standard statistics file that contains standard soccer statistics for each team, and a squad goalkeeping file that contains goalkeeping and defensive statistics for each team. Each of these files was created by going to each season’s page on FBref and clicking the “Get table as CSV” button under the “Share & more” dropdown next to each of the three tables we are pulling data from. The tables are converted into CSV format, which I then copied into an empty file named according to the year the season took place in and which table the data came from. In total, 87 files make up our dataset, and you can find all the files and the Jupyter Notebook for this project in the GitHub repository linked here. In the code below, we have three functions, one for each table type, that load a specified year’s data into a Pandas DataFrame, clean it to remove any duplicate or unneeded columns, and rename any columns that need more descriptive names. We then loop through all the years our dataset covers, call those three functions to load the three files associated with that year, merge them into one DataFrame, and append the result to the end of a single DataFrame containing data from all the years, so we only have to work with one DataFrame.
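The merge-and-append step can be sketched with tiny in-memory stand-ins for the three per-season tables (the real notebook reads the FBref CSV exports with pd.read_csv; the column names and values below are simplified assumptions for illustration):

```python
import pandas as pd

def load_season(year, league_table, squad_stats, goalkeeping):
    """Merge the three per-season tables on the squad name and tag the season year."""
    merged = league_table.merge(squad_stats, on="Squad").merge(goalkeeping, on="Squad")
    merged["Year"] = year
    return merged

# Tiny stand-ins for one season's three CSV files (made-up values)
league_table = pd.DataFrame({"Squad": ["Arsenal", "Leeds"], "Rk": [1, 2], "Pts": [84, 77]})
squad_stats = pd.DataFrame({"Squad": ["Arsenal", "Leeds"], "Gls": [73, 60]})
goalkeeping = pd.DataFrame({"Squad": ["Arsenal", "Leeds"], "Save%": [74.1, 70.3]})

# Appending each season onto one combined DataFrame, as described above
full_data = pd.concat(
    [load_season(2002, league_table, squad_stats, goalkeeping)],
    ignore_index=True,
)
print(full_data.shape)
```

Merging on the squad name gives one row per team-season with all three tables' columns side by side, which is the shape the rest of the analysis works with.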

Data Dictionary

The list below describes the column names of each column in our full dataset:

Exploratory Data Analysis

In this next section, we will conduct an exploratory data analysis of our dataset. We will look at and visualize our dataset to see its trends and properties. This will give us a better understanding of the dataset as a whole as well as give us insight into what statistics might be useful for the model later on.

Scatter Plot of Year vs. Points

Our first visualization is a scatter plot of individual team point totals over the years. Looking at the plot, we see that the regression line has a slightly negative slope, which indicates that point totals are trending downwards over time. This is an interesting trend and one we should look into further. Besides the negative slope, I can’t make out any other trends because of how many points are on the plot. This can be remedied by looking at the point distributions by year, which we will do in the next visualization.
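A plot like this can be drawn with Seaborn's regplot, which overlays a least-squares regression line on a scatter plot. The data below is a small made-up stand-in, not the real dataset:

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so this runs headless
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns

# Illustrative stand-in; the real plot uses every team-season since 1992-93
full_data = pd.DataFrame({
    "Year": [1993, 1995, 2000, 2005, 2010, 2015, 2019],
    "Pts": [84, 89, 91, 83, 86, 87, 98],
})

# regplot draws the scatter of point totals plus a fitted regression line
ax = sns.regplot(x="Year", y="Pts", data=full_data)
ax.set(xlabel="Season", ylabel="Points")
plt.savefig("points_by_year.png")
```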

Violin Plot of Points by Year

The next visualization we have is a violin plot of points by year, which shows us the distribution of points for each year. Looking at the plot, I notice that the white dot in each distribution, which marks the median point total for that year, does appear to drift down slightly over time. Another thing I notice is that there are periods where teams post very high and very low point totals, and other periods with more parity in point totals. In particular, the period from 1992 to 1998 shows a denser distribution with fewer points far from the center. The mid-to-late 2000s as well as the late 2010s show the opposite trend, with distributions that have wider ranges in point totals. To look further into these trends, we should take a look at wins, draws, and losses over the years.
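The per-year distributions can be drawn with Seaborn's violinplot, whose default inner miniature boxplot includes the white dot at each distribution's center. The data below is a small illustrative stand-in, not the real dataset:

```python
import matplotlib
matplotlib.use("Agg")  # non-interactive backend so this runs headless
import matplotlib.pylab as plt
import pandas as pd
import seaborn as sns

# Made-up point totals for two seasons, just to show the plot call
full_data = pd.DataFrame({
    "Year": [1993] * 4 + [2019] * 4,
    "Pts": [84, 71, 60, 52, 98, 72, 45, 21],
})

# One violin per season showing the distribution of team point totals
ax = sns.violinplot(x="Year", y="Pts", data=full_data)
ax.set(xlabel="Season", ylabel="Points")
plt.savefig("points_violin.png")
```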

Scatter Plot of the Number of Wins for a Team by Year

With our next visualization, we have a scatter plot of the number of wins for a team by year. The first thing that jumps out to me is that the regression line is roughly horizontal, indicating there is little change in the number of wins over time. There are a few high outliers after 2015, which indicates that there must have been some very dominant teams in those years. Additionally, there are some low outliers between 2005 and 2010, so there must have been teams that did not do as well during that period. These might have caused the wider distributions that we noticed in the previous visualization. As for the trend of decreasing point totals from the first visualization, this plot doesn’t give us any explanation for it.

Scatter Plot of the Number of Draws for a Team by Year

The next visualization we have shows us the number of draws for a team by year. The big thing that I notice is that over time, the number of draws appears to be decreasing. This could be what is behind the trend of decreasing point totals over time from the first visualization. Since we didn’t see any drop in the number of wins over time, this reduction in draws should be matched by an increase in the number of losses over time. In the next visualization, we will take a look at that.

Scatter Plot of the Number of Losses for a Team by Year

Our next visualization is a scatter plot of the number of losses for a team by year. Looking at the regression line, we see that the number of losses seems to stay roughly the same over time. This is interesting, as we figured we would see an increase in the number of losses over time to compensate for the reduction in draws that we saw in the last visualization. This indicates that we should look at the number of games played by year, since that reduction in draws and the trend from the first visualization had to come from somewhere.

Scatter Plot of Number of Matches for a Team by Year

This visualization shows us a scatter plot of the number of matches for a team by year. This plot finally shows us where the decreasing trend in the number of points and the number of draws came from. As you can see from the plot, in the first 3 seasons the teams played 42 matches, and after that they only played 38. This reduction in the number of matches means there are fewer points available to a team, which would cause the trends we noticed earlier. Because of this change in the number of matches, we should look at statistics from a per-match point of view to negate the effect 4 additional games would have on total season statistics. Additionally, since each team plays every other team twice, this also means that there were 22 teams in the first three seasons and 20 teams in the seasons after that.
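Converting season totals to per-match figures is a simple division by matches played. A minimal sketch, assuming a column "MP" holds matches played (the column names and values here are illustrative assumptions):

```python
import pandas as pd

# Two stand-in team-seasons: one from a 42-match season, one from a 38-match season
full_data = pd.DataFrame({
    "Squad": ["Man Utd 1993-94", "Arsenal 2003-04"],
    "MP": [42, 38],   # matches played
    "Pts": [92, 90],  # season point total
    "GF": [80, 73],   # goals scored
})

# Divide each season total by matches played to get comparable per-match stats
for col in ["Pts", "GF"]:
    full_data[f"{col}PerMatch"] = full_data[col] / full_data["MP"]

print(full_data[["Squad", "PtsPerMatch", "GFPerMatch"]])
```

On a per-match basis the two seasons become directly comparable despite the different season lengths.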

Scatter Plot of Points per Match for a Team by Year

Because of our discovery in the change in the number of matches played by each team, we should go back and look at the points for a team by year from a per match standpoint to negate the effects of 4 additional games on point totals. We see this working in the visualization above which shows that points per match are stable over time. This shows us that the trend we saw in the original scatter plot of point totals for a team by year was most likely due to the additional matches in the first 3 seasons.

Scatter Plot of Goals per Match vs. Rank in League Table

This next visualization looks at goals per match vs. rank in the league table. As goals are the way you win matches, you would expect that teams that score more goals per match are more likely to be at the top of the league table. We see this in our visualization as we see that the teams ranked the highest typically score the most goals per match and as you go down the league table, teams tend to score less. Because goals per match seems to do a good job indicating how a team will rank in the table, it might be a good statistic to consider for our model.

Scatter Plot of Assists per Match vs. Rank in League Table

Another statistic that I think might make for a useful predictor for our model is assists per match. An assist is when a player passes the ball to a teammate who then scores. Since assists are tied to goals, which we saw above can lead to higher positions in the league table, assists might also be valuable for our model. This visualization plots assists per match vs. rank in the league table to see if it shows us something useful. Looking at the visualization, we again see a decreasing trend in the number of assists per match as you go down the league table, indicating that more assists per match are connected with a higher league table position.

Scatter Plot of Goals Allowed per Match vs. Rank in League Table

The next statistic we are looking at in relation to league table rank is goals allowed per match. The visualization we have here is a scatter plot of goals allowed per match vs. rank in the league table. In this plot we see a trend of increasing goals allowed per match as you go down the rankings. This makes sense: if a team allows a lot of goals, it is harder to win or draw games and earn points, as the team would also have to score more goals to make up for it. Because of this, we should consider using it as a predictor for our model.

Scatter Plot of Goal Difference per Match vs. Rank in League Table

Goal difference per match is another statistic that might give us insight into a team's rank as it looks at the average difference in goals between the team and who they are playing. A negative goal difference per match indicates that they are getting outscored on average which would make it hard to win a lot of games. The visualization above plots goal difference per match vs. rank in the league table. Looking at the plot, we see a trend in that teams that are ranked higher typically have more positive goal differences per match. This might make this statistic a good predictor for the model.

Scatter Plot of Goalkeeper Save Percentage vs. Rank in League Table

Another statistic I would like to take a look at is goalkeeper save percentage since if your goalkeeper is giving up a lot of goals, it is hard to win games. This visualization plots goalkeeper save percentage vs. rank in the league table to see if there is any trend between the two. Looking at the visualization, we do see a slightly decreasing trend between the two. This statistic might be a decent predictor but we will have to do further analysis to see if it is worth adding it to our model.

Scatter Plot of Clean Sheet Percentage vs. Rank in League Table

A clean sheet is a game where the team does not concede a goal. Clean sheet percentage is another defensive statistic that looks at the percentage of matches that a team kept a clean sheet. Our visualization above plots clean sheet percentage against rank in the league table to see if they are connected in any way. The plot does appear to show a connection between the two as teams that are ranked higher tend to have higher clean sheet percentages. Again, because of this trend, this might make a good predictor for our model.

Scatter Plot of Shots on Target Allowed Per Match vs. Rank in League Table

The last statistic we are going to look at is shots on target allowed per match, which measures a team's ability to prevent shots that could go in. The fewer shots on target a team allows, the fewer chances its opponents have to score. This visualization plots the statistic against rank in the league table so we can see if there is a relationship between the two. Again, we find one: teams that concede fewer shots on target per match are typically higher-ranked. This relationship makes it another statistic we should consider for the model.

Deciding Which Predictors To Use

In this section, we will finally get to build our model for predicting league table rankings. Because of the additional matches played in the first three seasons, any statistics considered will need to be calculated on a per-match (or the equivalent per-90-minutes) basis to remove the effects of the additional games. Along with the statistics identified above, I have also included a few additional potential predictors that we will evaluate alongside them: penalty kicks per match, penalty kick attempts per match, yellow cards per match, red cards per match, number of players on a team, and the average age of players on a team. I decided to look at the penalty kick statistics because when one is given, it can be a contentious decision that people talk about long after the game. Because of that contentiousness, I thought it would be interesting to see if there was any correlation between them and ranking in the league table. I chose the yellow and red card statistics because I was curious if there was any correlation between the discipline of a team and their ranking. Lastly, I chose the number of players and average age to see if there was any connection between squad depth and ranking or age and ranking. For each of these potential predictors, we need to ensure that it will actually add to the model, which we can do by checking its Pearson correlation coefficient with league table rank below.

Looking at our heat map of the correlation matrix between our potential predictors and rank in the league table (the variable we are trying to predict), we see a lot of different numbers. To keep things simple, if a statistic’s Pearson correlation coefficient is roughly between −0.3 and +0.3, we will remove it from consideration in our model, since its relationship with what we are trying to predict is weak. With this criterion, we end up throwing out the number of players, average age, the penalty kick statistics, and the yellow/red card statistics. This leaves us with goal difference per match, goals scored per match, assists per match, goals allowed per match, save percentage, clean sheet percentage, and shots on target allowed per match.
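The correlation screen described above can be sketched as follows. The data is a tiny made-up stand-in with one strongly correlated candidate and one weak one; the column names are assumptions for illustration:

```python
import pandas as pd

# Made-up team-seasons: goal difference tracks rank closely, card counts do not
data = pd.DataFrame({
    "Rank": [1, 2, 3, 4, 5, 6, 7, 8],
    "GoalDiffPerMatch": [1.5, 1.1, 0.9, 0.4, 0.1, -0.2, -0.5, -0.9],
    "YellowCardsPerMatch": [1.7, 1.4, 1.9, 1.6, 1.5, 1.8, 1.4, 1.6],
})

# Pearson correlation of every candidate predictor with league table rank
correlations = data.corr(method="pearson")["Rank"].drop("Rank")

# Keep only candidates whose |r| with rank exceeds the 0.3 threshold
kept = correlations[correlations.abs() > 0.3].index.tolist()
print(kept)
```

The heat map in the write-up is just a visual rendering of this same correlation matrix (e.g. via sns.heatmap).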

Next, we look at our potential predictors and their correlations with each other. We do not want to include anything that is effectively the same statistic under a different name, as that just adds data to the model without improving it. To prevent this, we will take a look at the Pearson correlation coefficient heat map for the potential predictors.

Looking at the heat map above, we see that for the most part our predictors are not correlated, but there are a few iffy ones. In particular, the coefficients between goals per match, assists per match, and goal difference per match are concerning: they are all 0.86 or higher, indicating strong positive correlations. Because of this, we should keep only one of the three to get rid of any redundancy in our predictors. I chose to stick with goal difference per match, as it captures both the offensive and defensive sides and how they compare. Goals allowed per 90 is also highly correlated with goal difference per match, so I removed it as well. That leaves goal difference per match, goalkeeper save percentage, clean sheet percentage, and shots on target allowed per match as the final set of predictors we will use to build our model.

Building and Testing the Model

Now that we have chosen our predictors, we can finally build and test our model. Since our task is predicting a final league table rank, which looks like a category, it might seem like a classification task, but I have chosen to use regression. This is because a classifier might assign multiple teams in one season to the same rank, which can’t happen in real life. A regressor avoids this by giving us a decimal value that can separate two teams; essentially, our predicted table will be the sorted order of the predicted ranks given to each team. With regression chosen, we now have to pick a few regressors to test against each other and see which one performs best. I ended up choosing linear regression, a decision tree regressor, and a random forest regressor. I chose the linear regressor because of its simplicity and how widespread it is; additionally, the ranks lend themselves to a linear pattern that fits linear regression nicely. Typically you might use a gradient descent version of linear regression when doing machine learning because it can be quicker. I decided against this because the dataset is on the small side, so it’s easy to compute the exact linear regression solution and not much slower than using gradient descent. I chose the decision tree regressor because it is easy to interpret and understand and works decently well on a wide variety of data. Lastly, I chose the random forest regressor because it is often one of the better machine learning methods, thanks to its combination of many decision trees into one prediction and its ability to work well on a wide variety of datasets. To evaluate the three models against each other, we will conduct a ten-fold cross-validation procedure on each one, average the scores across folds, and compare the averages.
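The ten-fold comparison described above can be sketched on a small synthetic dataset (the real features are the four predictors chosen earlier; everything below is illustrative only):

```python
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor

rng = np.random.default_rng(0)
X = rng.normal(size=(200, 4))  # stand-in for the four chosen predictors
# Synthetic target: a linear combination of the features plus noise
y = X @ np.array([3.0, -1.0, 2.0, 0.5]) + rng.normal(scale=0.5, size=200)

models = {
    "linear": LinearRegression(),
    "tree": DecisionTreeRegressor(random_state=0),
    "forest": RandomForestRegressor(random_state=0),
}

# For regressors, cross_val_score returns one R^2 score per fold by default;
# we average the ten fold scores for each model and compare the averages.
averages = {
    name: cross_val_score(model, X, y, cv=10).mean()
    for name, model in models.items()
}
print(averages)
```

With the real dataset the same loop produced the fold averages reported in the next paragraph.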

Since our models are regressors, our machine learning library SciKit-Learn will return R squared values for each iteration of the cross-validation procedure. Looking at the results from our ten-fold cross-validation procedure, we see that the random forest regressor did the best with an R squared average of around 0.86. The linear regressor did second best with an R squared average of around 0.82. Last was the decision tree regressor with an R squared average of around 0.78. Based on these results, we will use the random forest regressor to predict the table for the 2020-2021 Premier League season.

Predicting the 2020-2021 Premier League Table

Now that we have selected the method we will use to predict the table, we can load in the current season data. This data came from FBref just like the rest of the data we have been using and was collected from their site on December 14th, 2020. After loading the current season data, we fit the model to the historical data we have been working with. We then pass each team’s data for the predictors we chose into the model, collect the results into an array, and sort it to get the predicted table.
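The fit-predict-sort step can be sketched as follows. The data, team names, and rank target here are made-up stand-ins, not the real historical or current-season values:

```python
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

rng = np.random.default_rng(1)
X_hist = rng.normal(size=(100, 4))  # stand-in for the four predictors, past seasons
# Stand-in rank target: rank of each row by its first feature (1 = best)
y_hist = np.argsort(np.argsort(-X_hist[:, 0])) + 1.0

# Fit the random forest on the historical team-seasons
model = RandomForestRegressor(random_state=0)
model.fit(X_hist, y_hist)

# Predict a rank value for each current-season team, then sort to get the table
current = pd.DataFrame(rng.normal(size=(4, 4)),
                       index=["Team A", "Team B", "Team C", "Team D"])
current["PredictedRank"] = model.predict(current.values)
predicted_table = current.sort_values("PredictedRank")
print(predicted_table["PredictedRank"])
```

The predicted values are decimals rather than whole ranks, which is exactly what lets us break ties between teams when sorting.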

Using our model, we predict that Tottenham Hotspur will be the team that finishes top of the table and wins the Premier League. On the other hand, we can sadly predict that Fulham, West Brom, and Sheffield United will be relegated to the lower division. Now, if you look at the number next to each team, you might notice that the top team isn’t a 1 and that other teams are only a few tenths of a place away from each other. This is because we are using regression: the model is saying that each team’s statistics look like those of teams that finished around that rank historically. However, by sorting the teams according to their predicted values, we get each team’s rank relative to the other teams that season, which is what we care about in determining the predicted table. So while the top team might not have a predicted rank of exactly 1, relative to the other teams’ predicted ranks, it is on top.

A New Statistic Enters the Field

While the model above used the entire history of the Premier League to predict the league table, the next model uses data from the 2017-2018 season onward. This is because in 2017, new, more advanced statistics began to be collected, such as expected goals, expected assists, and expected goals allowed. These statistics are calculated by estimating the probability that a shot will result in a goal based on the characteristics of play leading up to it, such as the location the shot was taken from, the body part used to take the shot, the type of pass before the shot, and the type of attack the shot came from, among others. Using these probabilities, you can gain insight into a team’s ability to create opportunities to score or concede goals. More information about these statistics can be found here. For our second model, we will utilize the expected goal difference per match statistic alongside the existing predictors to provide an additional data point that might change how our model predicts the table.
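The core of the expected goals idea can be shown with a few lines: a team's xG for a match is the sum of its per-shot scoring probabilities. The probabilities below are made up for illustration; real xG models derive them from the shot characteristics described above:

```python
# One made-up scoring probability per shot a team took in a match
shot_probabilities = [0.76, 0.12, 0.04, 0.33, 0.08]

# Expected goals = sum of the per-shot probabilities
expected_goals = sum(shot_probabilities)
print(round(expected_goals, 2))  # prints 1.33
```

Expected goal difference per match then compares a team's xG against the xG it concedes, averaged over its matches.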

Using this new model utilizing the new advanced statistics, we predict that Chelsea will finish top of the table and win the Premier League. At the bottom of the table, we find Burnley, West Brom, and Sheffield United in the bottom three spots and therefore they are predicted to be relegated.

Conclusion

With our dataset of 28 years of English Premier League history, we created two machine learning models using random forest regressors to predict what the final league table will look like. We analyzed the dataset to identify four predictors that could be used to build our models. According to the first model, which uses the full historical data, we predict that Tottenham Hotspur will win the Premier League. In the second model, which utilizes a much smaller dataset, but includes a new, advanced statistic, we predict that Chelsea will win the Premier League. We will have to wait till the end of the season to see who was right. May the best model win!