import sqlite3
import pandas
from matplotlib import pyplot as plt
import math
import numpy as np


# Open the Lahman 2014 baseball database, stored as a local SQLite file.
sqlite_file = 'lahman2014.sqlite'
conn = sqlite3.connect(sqlite_file)


# Smoke-test the connection: salary summed per season over the Salaries
# rows whose league is 'AL'.
salary_query = (
    "SELECT yearID, sum(salary) as total_payroll "
    "FROM Salaries "
    "WHERE lgID == 'AL' "
    "GROUP BY yearID"
)
team_salaries = pandas.read_sql(sql=salary_query, con=conn)
team_salaries.head()


# Inner-join Teams with Salaries to get one row per (team, season): the
# team's total payroll next to its record for that season.
# winPercentage divides by the games the team actually played (t.G), not a
# fixed 162: not every season has 162 games, and a fixed divisor understates
# the win rate of any team that played fewer.
# Column order matters: later code reads these columns by position.
query = """
SELECT
    t.teamID,
    t.yearID,
    SUM(s.salary) AS totalSalary,
    t.W,
    t.L,
    t.W*100.0/t.G AS winPercentage,
    t.Rank,
    t.G
FROM
    Teams t
JOIN Salaries s
    ON t.teamID==s.teamID AND t.yearID==s.yearID
GROUP BY
    t.teamID,t.yearID
"""
data = pandas.read_sql(query,conn)


data


# Plot total payroll against season, one line per team.
# teamIDs lists the unique team ids in the order they first appear in `data`.
teamIDs = data['teamID'].unique()
dataLength = data.shape[0]
numOfTeams = len(teamIDs)
# Group the rows once by team instead of rescanning the whole table for every
# team; sort=False keeps first-appearance order, i.e. the order of teamIDs.
for teamName, teamRows in data.groupby('teamID', sort=False):
    plt.plot(teamRows['yearID'], teamRows['totalSalary'], label=teamName)
plt.title("Salaries of MLB Teams vs. Year")
plt.xlabel("Year")
plt.ylabel("Total Team Salary")

# Output: Text(0, 0.5, 'Total Team Salary')


# Show the standard deviation of the payroll over time to confirm the increase in spread.
# For each entry years[i]:
#   xbar[i] = mean of the team payrolls in that season
#   sx[i]   = population standard deviation of those payrolls (divide by N)
years = data['yearID'].unique()
numOfYears = len(years)
payrollByYear = data.groupby('yearID')['totalSalary']
# groupby returns results ordered by year; reindex(years) puts them back in
# the order of `years` so xbar[i] and sx[i] belong to years[i].
xbar = payrollByYear.mean().reindex(years).to_numpy()
sx = payrollByYear.std(ddof=0).reindex(years).to_numpy()


# `years` is in first-appearance order, so sort before drawing the line
sortedIdx = np.argsort(years)
plt.plot(years[sortedIdx],sx[sortedIdx],label='std')
plt.title("Standard Deviation of Team Salaries per Year")
plt.xlabel("Year")
plt.ylabel("Standard Deviation in Salary")

# Output: Text(0, 0.5, 'Standard Deviation in Salary')


# Discretize the seasons into five-year periods for the per-period scatter plots.
# discreteData is deliberately the very same object as `data`: the code that
# follows adds columns through one name and reads them through the other, and
# pandas.DataFrame(data) is not guaranteed to keep sharing newly added columns
# with `data` (recent pandas versions return a shallow copy).
discreteData = data
intervals = [(1989,1994),(1994,1999),(1999,2004),(2004,2009),(2009,2014)]
bins = pandas.IntervalIndex.from_tuples(intervals)
# Intervals are closed on the right by default, so (1989, 1994] covers seasons
# 1990-1994; seasons that fall in no interval get NaN.
discreteData['yearGroup'] = pandas.cut(discreteData['yearID'],bins)


# One figure per time period: each point is a team's mean payroll and mean
# winning percentage over the seasons that fall in that period.
for period in bins:
    plt.figure()
    inPeriod = discreteData[discreteData['yearGroup'] == period]
    # Per-team means for this period, addressed by column name rather than by
    # column position.
    teamMeans = inPeriod.groupby('teamID')[['totalSalary', 'winPercentage']].mean()
    # Walk teamIDs so teams are drawn in the same order in every figure.
    for team in teamIDs:
        if team not in teamMeans.index:
            continue  # this team has no seasons in the period
        payroll = teamMeans.at[team, 'totalSalary']
        wp = teamMeans.at[team, 'winPercentage']
        plt.scatter(payroll, wp)
        # every team except OAK gets a text label next to its point
        if team != "OAK":
            plt.text(payroll, wp, team)
    plt.title("Winning Percentage vs. Payroll for MLB teams %s"%str(period))
    plt.xlabel("Mean Payroll")
    plt.ylabel("Mean Winning Percentage")


# Add a standardized-payroll column:
#     (team payroll - mean payroll that season) / std of payroll that season
# using the per-season mean `xbar` and standard deviation `sx`, which are
# aligned element-for-element with `years`.
yearMean = pandas.Series(xbar, index=years)
yearStd = pandas.Series(sx, index=years)
seasonOfRow = data['yearID']
standardized = (data['totalSalary'] - seasonOfRow.map(yearMean)) / seasonOfRow.map(yearStd)
# Insert real floats directly (no '' placeholders, which would leave the
# column object-typed). The column stays at position 9 because later code
# reads it by position.
data.insert(9, 'stdPayroll', standardized)


# Repeat the per-period scatter plots, this time with standardized payroll on
# the x axis: each point is a team's mean standardized payroll and mean
# winning percentage over the seasons in that period.
for period in bins:
    plt.figure()
    periodRows = discreteData.loc[
        discreteData['yearGroup'] == period,
        ['teamID', 'stdPayroll', 'winPercentage'],
    ]
    # stdPayroll may be stored as an object column; cast so the mean is numeric
    periodRows = periodRows.astype({'stdPayroll': float})
    teamMeans = periodRows.groupby('teamID')[['stdPayroll', 'winPercentage']].mean()
    # Walk teamIDs so teams are drawn in the same order in every figure.
    for team in teamIDs:
        if team not in teamMeans.index:
            continue  # this team has no seasons in the period
        payroll = teamMeans.at[team, 'stdPayroll']
        wp = teamMeans.at[team, 'winPercentage']
        plt.scatter(payroll, wp)
        # every team except OAK gets a text label next to its point
        if team != "OAK":
            plt.text(payroll, wp, team)
    plt.title("Winning Percentage vs. Payroll for MLB teams %s"%str(period))
    plt.xlabel("Standardized Payroll")
    plt.ylabel("Mean Winning Percentage")
    # pyplot's function for writing a figure to disk is savefig; the default
    # dpi is used.
    # NOTE(review): the file name is fixed, so each period overwrites the
    # previous one and only the last period's figure remains on disk. Use a
    # per-period file name if all five figures are wanted.
    plt.savefig("moneyball.png")


# Pool every team-season into one scatter plot (column 9 on x, column 5 on y)
# and overlay the degree-1 least-squares fit.
# NOTE(review): no year filter is applied here although the title says
# 1990-2014, and the y label says "Mean" although each point is a single
# team-season - confirm which is intended.
xData = data.iloc[:, 9].tolist()
yData = data.iloc[:, 5].tolist()
plt.scatter(xData, yData)

# np.polyfit returns coefficients highest power first: slope, then intercept
coefficients = np.polyfit(xData, yData, 1)
m, b = coefficients[0], coefficients[1]
fittedLine = np.array(xData) * m + b
plt.plot(xData, fittedLine, color='red')

plt.title("Winning Percentage vs. Payroll for MLB teams (1990-2014)")
plt.xlabel("Standardized Payroll")
plt.ylabel("Mean Winning Percentage")

# Output: Text(0, 0.5, 'Mean Winning Percentage')


# Add a spending-efficiency column: the team's actual winning percentage
# minus the winning percentage expected from its standardized payroll,
# where expected = 50 + 2.5 * stdPayroll.
# The cast covers the case where stdPayroll is stored as an object column.
standardizedPayroll = data['stdPayroll'].astype(float)
expectedWP = 50 + 2.5 * standardizedPayroll
# Insert real floats directly (no '' placeholders, which would leave the
# column object-typed). The column stays at position 10.
data.insert(10, 'spendingEfficiency', data['winPercentage'] - expectedWP)


# Line plots of spending efficiency over the seasons for OAK, BOS, NYA, ATL
# and TBA, all drawn on the same axes.
teams = ['OAK','BOS','NYA','ATL','TBA']
for tid in teams:
    teamRows = data[data['teamID'] == tid]
    # cast in case the efficiency column is stored as an object column
    plt.plot(teamRows['yearID'], teamRows['spendingEfficiency'].astype(float), label=tid)
# Title, axis labels and legend do not depend on the team, so they are set
# once after every line has been drawn.
plt.title("Efficiency of MLB Teams")
plt.xlabel("Year")
plt.ylabel("Spending Efficiency")
plt.legend()

	yearID	total_payroll
0	1985	134401120.0
1	1986	157716444.0
2	1987	136088747.0
3	1988	157049812.0
4	1989	188771688.0

	teamID	yearID	totalSalary	W	L	winPercentage	Rank	G
0	ANA	1997	31135472.0	84	78	51.851852	2	162
1	ANA	1998	41281000.0	85	77	52.469136	2	162
2	ANA	1999	55388166.0	70	92	43.209877	4	162
3	ANA	2000	51464167.0	82	80	50.617284	3	162
4	ANA	2001	47535167.0	75	87	46.296296	3	162
...	...	...	...	...	...	...	...	...
853	WAS	2010	61400000.0	69	93	42.592593	5	162
854	WAS	2011	63856928.0	80	81	49.382716	3	161
855	WAS	2012	80855143.0	98	64	60.493827	1	162
856	WAS	2013	113703270.0	86	76	53.086420	2	162
857	WAS	2014	131983680.0	96	66	59.259259	1	162

Moneyball: An analysis of the relation between expenditures and success in Major League Baseball

Import packages

Load data and extract using SQL

Visualize Team Expenditure

Visualize expenditure vs. winning percentage

Convert to standardized payrolls

Fit linear model to data

Compute spending efficiency