Prior Paper

Prediction for 2006 Germany World Cup using Bradley-Terry Model.pdf

Data Collection

The original paper used matches from the most recent 20 years; we will collect 10 years of data, from 2013 to 2022.

We used the Kaggle dataset of international football results: https://www.kaggle.com/datasets/martj42/international-football-results-from-1872-to-2017

The following Python scripts were used for data preprocessing:

country_list.py

import pandas as pd
import numpy as np
import country_list as cl
import random as rd

## Country List

# World Cup country lists, one list of four teams per group (a-h)
country_a = ['Qatar', 'Ecuador', 'Senegal', 'Netherlands']
country_b = ['England', 'Iran', 'United States', 'Wales']
country_c = ['Argentina', 'Saudi Arabia', 'Mexico', 'Poland']
country_d = ['France', 'Australia', 'Denmark', 'Tunisia']
country_e = ['Spain', 'Costa Rica', 'Germany', 'Japan']
country_f = ['Belgium', 'Canada', 'Morocco', 'Croatia']
country_g = ['Brazil', 'Serbia', 'Switzerland', 'Cameroon']
country_h = ['Portugal', 'Ghana', 'Uruguay', 'South Korea']

# Teams taken from the FIFA ranking top 100 (2022.10.12).
# NOTE(review): the list holds 97 names, not 100 — presumably three ranked
# teams were left out on purpose; confirm.
# No backslash continuations are needed: a bracketed literal may span lines.
country_all = [
    'Brazil', 'Belgium', 'Argentina', 'France', 'England',
    'Italy', 'Spain', 'Netherlands', 'Portugal', 'Denmark',
    'Germany', 'Croatia', 'Mexico', 'Uruguay', 'Switzerland',
    'United States', 'Colombia', 'Senegal', 'Wales', 'Iran',
    'Serbia', 'Morocco', 'Peru', 'Japan', 'Sweden',
    'Poland', 'Ukraine', 'South Korea', 'Chile', 'Tunisia',
    'Costa Rica', 'Nigeria', 'Russia', 'Austria', 'Czech Republic',
    'Hungary', 'Algeria', 'Australia', 'Egypt', 'Scotland',
    'Canada', 'Norway', 'Cameroon', 'Ecuador', 'Turkey',
    'Mali', 'Paraguay', 'Ivory Coast', 'Republic of Ireland', 'Qatar',
    'Saudi Arabia', 'Greece', 'Romania', 'Burkina Faso', 'Slovakia',
    'Finland', 'Venezuela', 'Bosnia and Herzegovina', 'Northern Ireland', 'Panama',
    'Ghana', 'Iceland', 'Slovenia', 'Jamaica', 'North Macedonia',
    'Albania', 'South Africa', 'Iraq', 'Montenegro', 'United Arab Emirates',
    'Bulgaria', 'El Salvador', 'Oman',
    'Israel', 'Uzbekistan', 'Georgia', 'China PR', 'Honduras',
    'Gabon', 'Bolivia', 'Guinea', 'Jordan', 'Bahrain',
    'Haiti', 'Zambia', 'Uganda', 'Syria',
    'Benin', 'Luxembourg', 'Armenia', 'Palestine', 'Kyrgyzstan',
    'Vietnam', 'Belarus', 'Equatorial Guinea', 'Lebanon', 'Congo',
]

# Map each country name to its position (0-based index) in country_all.
country_dic = {name: idx for idx, name in enumerate(country_all)}

csv_edit.py

import pandas as pd
import numpy as np
import country_list as cl
import random as rd

# read csv
data = pd.read_csv('csv/results.csv')

# time filtering
# 'date' is compared as a plain string, so the bounds must stay in
# zero-padded YYYY-MM-DD form for the comparison to follow calendar order.
# NOTE(review): the accompanying notes state a 2013-2022 window, but this
# filter starts at 2001 — confirm which range is intended.
start_time = '2001-01-01'
end_time = '2022-12-31'
is_valid_date = (start_time <= data['date']) & (data['date'] <= end_time)
data = data[is_valid_date]

# country filtering: keep a match only when both teams are in the country list
# (the list lives in the country_list module, imported as `cl`)
is_valid_country = (data['home_team'].isin(cl.country_all)) & (data['away_team'].isin(cl.country_all))
# .copy() so the column assignment below works on an independent frame
data = data[is_valid_country].copy()

# define match type
# home team win = 0 / draw = 1 / away team win = 2
# (NaN when none of the three comparisons holds)
match_type = np.select(
    [
        data['home_score'] > data['away_score'],
        data['home_score'] == data['away_score'],
        data['home_score'] < data['away_score'],
    ],
    [0, 1, 2],
    default=np.nan,
)
data['match_type'] = match_type

# delete date / score / tournament / city / country columns
data = data.drop(columns=['date', 'home_score', 'away_score', 'tournament', 'city', 'country'])

# print informations
print(data)

# the row index is written too, so it comes back as an extra first column
data.to_csv('csv/data.csv')

# to check whether there is INVALID country which has no record of matches
data = pd.read_csv('csv/data.csv')
data_lst = [list(row) for row in data.values]
data_len = len(data_lst)
# one counter per listed country, sized from the list instead of a fixed 100
tmp_record = np.zeros(len(cl.country_all))
# read team names by column name so the tally does not depend on column order
for home_name, away_name in zip(data['home_team'], data['away_team']):
    home_team = cl.country_dic[home_name]
    away_team = cl.country_dic[away_name]
    tmp_record[home_team] += 1
    tmp_record[away_team] += 1
print("TOTAL LENGTH: "+str(len(cl.country_all)))
for i, country_name in enumerate(cl.country_all):
    if tmp_record[i] == 0:
        print(" INVALID "+country_name+" !! ")

Result

TOTAL LENGTH: 97

Estimation Model Adaptation

Variables

Team A’s worth parameter: $\gamma_{A}$

Parameter for draw: $\lambda$

Parameter for home advantage: $\delta, h$