import sqlite3
import pandas
from matplotlib import pyplot as plt
import math
import numpy as np
# connect to database
sqlite_file = 'lahman2014.sqlite'
conn = sqlite3.connect(sqlite_file)
# try querying
salary_query = "SELECT yearID, sum(salary) as total_payroll FROM Salaries WHERE lgID == 'AL' GROUP BY yearID"
team_salaries = pandas.read_sql(salary_query,conn)
team_salaries.head()
yearID | total_payroll | |
---|---|---|
0 | 1985 | 134401120.0 |
1 | 1986 | 157716444.0 |
2 | 1987 | 136088747.0 |
3 | 1988 | 157049812.0 |
4 | 1989 | 188771688.0 |
# use relational database to query data...use inner join
query = """
SELECT
t.teamID,
t.yearID,
SUM(s.salary) AS totalSalary,
t.W,
t.L,
t.W*100/162.0 AS winPercentage,
t.Rank,
t.G
FROM
Teams t
JOIN Salaries s
ON t.teamID==s.teamID AND t.yearID==s.yearID
GROUP BY
t.teamID,t.yearID
"""
data = pandas.read_sql(query,conn)
data
teamID | yearID | totalSalary | W | L | winPercentage | Rank | G | |
---|---|---|---|---|---|---|---|---|
0 | ANA | 1997 | 31135472.0 | 84 | 78 | 51.851852 | 2 | 162 |
1 | ANA | 1998 | 41281000.0 | 85 | 77 | 52.469136 | 2 | 162 |
2 | ANA | 1999 | 55388166.0 | 70 | 92 | 43.209877 | 4 | 162 |
3 | ANA | 2000 | 51464167.0 | 82 | 80 | 50.617284 | 3 | 162 |
4 | ANA | 2001 | 47535167.0 | 75 | 87 | 46.296296 | 3 | 162 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
853 | WAS | 2010 | 61400000.0 | 69 | 93 | 42.592593 | 5 | 162 |
854 | WAS | 2011 | 63856928.0 | 80 | 81 | 49.382716 | 3 | 161 |
855 | WAS | 2012 | 80855143.0 | 98 | 64 | 60.493827 | 1 | 162 |
856 | WAS | 2013 | 113703270.0 | 86 | 76 | 53.086420 | 2 | 162 |
857 | WAS | 2014 | 131983680.0 | 96 | 66 | 59.259259 | 1 | 162 |
858 rows × 8 columns
The SQL query selects certain columns from the "teams" and "salaries" tables and performs an inner join to match up the teamIDs and the yearIDs. We utilize SUM and other operations to calculate win percentage and total salaries. The resulting table is stored in "data" as a Pandas dataframe. There will be missing data in the relation since there is no salary information in early years of baseball. Thus, the joined dataset will not include teams from early years. Since our main goal is to analyze the salaries of various teams and their win percentages, there is no point including the teams without salaries.
# Now plot payroll vs years for each team
# find unqiue team ids
teamIDs = data['teamID'].unique()
dataLength = data.shape[0]
numOfTeams = len(teamIDs)
for i in range(numOfTeams):
yData = []
xData = []
teamName = teamIDs[i]
for j in range(dataLength):
if (data.iat[j,0]==teamName):
xData.append(data.iat[j,1])
yData.append(data.iat[j,2])
plt.plot(xData,yData,label=teamName)
plt.title("Salaries of MLB Teams vs. Year")
plt.xlabel("Year")
plt.ylabel("Total Team Salary")
Text(0, 0.5, 'Total Team Salary')
The figure contains 30 line plots that correspond to team payroll on a given year. Based on the plot, I can state that team salary appears to generally increase as time goes on. Additionally, the spread of salaries has increased over time. However, this plot does not give any information on the relationship between increased spending and win percentage.
# Show the standard deviation of the payroll over time to confirm the increase in spread
years = data['yearID'].unique()
numOfYears = len(years)
sx = np.zeros(numOfYears)
xbar = np.zeros(numOfYears)
for i in range(numOfYears):
y = years[i]
sData = []
for j in range(dataLength):
if (data.iat[j,1]==y):
# elemts of sData contain the total salary for each team
sData.append(data.iat[j,2])
xbar[i] = sum(sData)/len(sData)
l = len(sData)
sx2 = 0
for xi in sData:
sx2 = sx2+(xi-xbar[i])**2
sx[i] = math.sqrt(sx2/l)
sortedIdx = np.argsort(years)
plt.plot(years[sortedIdx],sx[sortedIdx],label='std')
plt.title("Standard Deviation of Team Salaries per Year")
plt.xlabel("Year")
plt.ylabel("Standard Deviation in Salary")
Text(0, 0.5, 'Standard Deviation in Salary')
The figure confirms the increase in spread in expenditures by MLB teams each year. As mentioned before, it is easy to see that the spread of total salaries increases with the year.
To visualize this relation, we will split up the data into different 5 year time periods from 1989 to 2014.
# Discretize into time periods, and plot each scatter plot
discreteData = pandas.DataFrame(data)
intervals = [(1989,1994),(1994,1999),(1999,2004),(2004,2009),(2009,2014)]
bins = pandas.IntervalIndex.from_tuples(intervals)
discreteData['yearGroup'] = pandas.cut(discreteData['yearID'],bins)
# interval 1
for i in bins:
plt.figure()
for j in teamIDs:
payroll = 0
wp = 0
num = 0
for k in range(dataLength):
if ((discreteData.iat[k,8]==i) and (discreteData.iat[k,0]==j)):
payroll = payroll + discreteData.iat[k,2]
wp = wp + discreteData.iat[k,5]
num = num+1
if num!=0:
payroll = payroll/num
wp = wp/num
plt.scatter(payroll,wp)
if j!="OAK":
plt.text(payroll,wp,j)
plt.title("Winning Percentage vs. Payroll for MLB teams %s"%str(i))
plt.xlabel("Mean Payroll")
plt.ylabel("Mean Winning Percentage")
We can see that in the earlier years of baseball, especially from 1995-1999, an increase in payroll spending from teams is correlated to a higher winning percentage. In 2000-2004, this correlation decreases a bit, only to increase in the 2005-2009 seasons. In 2010-2014 this correlation drops a bit again. However, the variance of the payrolls over the years makes it hard to make direct comparisions
The Oakland Athletics had an impressive seasons from 2000-2004 as the team had top winning percentages but was well below the average payroll compared to teams that had similar winning percentages. The New York Yankees have shown to continuously spend the most on player salaries and have kept at the top in terms of winning percentages over the years. The Boston Red Sox have also shown to pay for win percentages.
# add column for standardized payroll
data.insert(9,'stdPayroll',['']*dataLength)
# calculate standardized payroll - we have previously calculated the standard deviation of payroll 'sx'
for i in range(numOfYears):
y = years[i]
for j in range(dataLength):
if (data.iat[j,1]==y):
stdp = (data.iat[j,2]-xbar[i])/sx[i] # standardization formula
data.iat[j,9] = stdp
Now, the 9th column of the table 'data' contains the standardized payrolls with a mean of 0 and standard deviation of 1. This will make it easier to make qualitative assessments of the scatter plots.
# repeat scatter plots with the standardized payrolls
# interval 1
for i in bins:
plt.figure()
for j in teamIDs:
payroll = 0
wp = 0
num = 0
for k in range(dataLength):
if ((discreteData.iat[k,8]==i) and (discreteData.iat[k,0]==j)):
payroll = payroll + discreteData.iat[k,9]
wp = wp + discreteData.iat[k,5]
num = num+1
if num!=0:
payroll = payroll/num
wp = wp/num
plt.scatter(payroll,wp)
if j!="OAK":
plt.text(payroll,wp,j)
plt.title("Winning Percentage vs. Payroll for MLB teams %s"%str(i))
plt.xlabel("Standardized Payroll")
plt.ylabel("Mean Winning Percentage")
plt.saveas("moneyball.png",dpi=)
Standardizing the mean payroll changes the x-values of the scatter points such that the mean is 0 and the standard deviation is 1. This helps us visualize the relationships between the two variables easily as it is no longer affected by different scales. As we can see, across all the time ranges there is a positive correlation between mean payroll and winning percentage. However, we can see a general trend that the strength of the correlation is decreasing.
# make a single scatter plot and use regression to plot line of best fit
xData = []
yData = []
for i in range(dataLength):
xData.append(data.iat[i,9])
yData.append(data.iat[i,5])
plt.scatter(xData,yData)
# use linear regression
coefficients = np.polyfit(xData,yData,1)
m,b = coefficients
plt.plot(xData,m*np.array(xData)+b,color='red')
plt.title("Winning Percentage vs. Payroll for MLB teams (1990-2014)")
plt.xlabel("Standardized Payroll")
plt.ylabel("Mean Winning Percentage")
Text(0, 0.5, 'Mean Winning Percentage')
The regression line supports the statement that the average payroll has a 50% win percentage. Further, the slope of the regression line suggests that for every 2 units of increase in standardized payroll, win percentage increases by 5%.
# add column of spending efficiency to data
data.insert(10,'spendingEfficiency',['']*dataLength)
for i in range(dataLength):
stdp = data.iat[i,9]
expectedWP = 50+2.5*stdp
data.iat[i,10] = data.iat[i,5]-expectedWP # calculate spending efficiency
# make line plots of OAK,BOS,NYA,ATL,TBA
tid = 'OAK'
teams = ['OAK','BOS','NYA','ATL','TBA']
for tid in teams:
#plt.figure()
xData = []
yData = []
for i in range(dataLength):
if (data.iat[i,0]==tid):
xData.append(data.iat[i,1])
yData.append(data.iat[i,10])
plt.plot(xData,yData,label=tid)
plt.title("Efficiency of MLB Teams")
plt.xlabel("Year")
plt.ylabel("Spending Efficiency")
plt.legend()
These plots of spending efficiency allow us to easily see trends in the relationship between team spending and win percentage. By combining payroll and win percentage into a single term, we are able to simply graph a 2D plot with more resolution on the time axis (compared to 5 periods). We are able to closely examine inidividual teams using this method. Before, we could only analyze general trends.