In this project, I will take you through the data science lifecycle - a series of tasks used to create interpretations about the world around us. This includes collecting, tidying, exploring, and modeling data. Specifically, I will be analyzing the connections and the costs associated with domestic flights. However, the general steps in the data science lifecycle can be applied to understand a variety of relationships from any of the vast data available on the internet.
Over the decades, air travel in the United States has become a very quick, safe, and sometimes even cheap method of transportation for medium to long distance trips. However, besides offering transportation services to consumers, airports also bring countries, states, and even communities together.
Cultural diversification, a goal that many people, programs, and political groups strive for, is key to fostering inclusion between geographically separated groups. Cities that have access to well-connected airports have cultural diversity at their doorstep. For these cities, it may be in their interest to invest in diversity, equity, and inclusion policies for their communities.
But how do we identify these cities? Which cities have access to popular airports? Determining this will be the main goal of the first part of this project.
To identify popular airports, we can use statistics that utilize the number of connections going in and out of a particular airport. However, what if a flight into a particular airport is very expensive? Surely that may reduce the economic diversity of the passengers that end up in a particular city. So, how can we make expensive destinations more accessible? Is there any way we can reduce the cost of transportation to reach a particular city? This will be the main goal of the second part of this project.
Let's import the Python libraries we will need.
import pandas as pd
import numpy as np
import networkx as nx
from matplotlib import pyplot as plt
from matplotlib.colors import LinearSegmentedColormap
import math
import pickle
import folium
import copy
There are many sources of data for flight connections and prices available. To get flight data in real time, one could use web scraping on popular travel websites such as kayak.com or data websites like direct-flights.com. However, for this tutorial I will be using the Origin and Destination Survey from the Bureau of Transportation Statistics. While this dataset includes flight data from as early as 1993, we will only use data from quarter one of 2023.
In addition, we would need latitude and longitude data for each airport for visualization. I have downloaded location data as a .txt file from Arash Partow.
flight_data = pd.read_csv("T_DB1B_MARKET_2023.csv")
location_data = pd.read_csv("GlobalAirportDatabase.txt",sep=':',header=None)
location_data.columns = ['ICAO code','IATA code',
'Airport Name','City','Country','Latitude Degrees',
'Latitude Minutes','Latitude Seconds',
'Latitude Direction','Longitude Degrees',
'Longitude Minutes','Longitude Seconds',
'Longitude Direction','Altitude',
'Latitude Decimal','Longitude Decimal']
As recommended by the Bureau of Transportation Statistics, let's remove data for flights that cost less than $50. These flights did not have a base fare and only contained taxes and fees.
deleteIdx = flight_data['MARKET_FARE'] < 50
flight_data = flight_data[~deleteIdx]
flight_data
ORIGIN_AIRPORT_ID | ORIGIN_CITY_MARKET_ID | ORIGIN | DEST_AIRPORT_ID | DEST_CITY_MARKET_ID | DEST | PASSENGERS | MARKET_FARE | |
---|---|---|---|---|---|---|---|---|
0 | 10135 | 30135 | ABE | 10136 | 30136 | ABI | 1.0 | 397.70 |
1 | 10135 | 30135 | ABE | 10136 | 30136 | ABI | 1.0 | 583.00 |
3 | 10135 | 30135 | ABE | 10140 | 30140 | ABQ | 1.0 | 151.83 |
4 | 10135 | 30135 | ABE | 10140 | 30140 | ABQ | 1.0 | 158.00 |
5 | 10135 | 30135 | ABE | 10140 | 30140 | ABQ | 1.0 | 223.50 |
... | ... | ... | ... | ... | ... | ... | ... | ... |
6078689 | 16869 | 32389 | XWA | 15412 | 35412 | TYS | 1.0 | 615.40 |
6078690 | 16869 | 32389 | XWA | 15412 | 35412 | TYS | 1.0 | 657.00 |
6078691 | 16869 | 32389 | XWA | 15919 | 31834 | XNA | 1.0 | 184.00 |
6078692 | 16869 | 32389 | XWA | 15919 | 31834 | XNA | 1.0 | 284.50 |
6078693 | 16869 | 32389 | XWA | 15919 | 31834 | XNA | 1.0 | 326.50 |
5650197 rows × 8 columns
location_data
ICAO code | IATA code | Airport Name | City | Country | Latitude Degrees | Latitude Minutes | Latitude Seconds | Latitude Direction | Longitude Degrees | Longitude Minutes | Longitude Seconds | Longitude Direction | Altitude | Latitude Decimal | Longitude Decimal | |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | AYGA | GKA | GOROKA | GOROKA | PAPUA NEW GUINEA | 6 | 4 | 54 | S | 145 | 23 | 30 | E | 1610 | -6.082 | 145.392 |
1 | AYLA | LAE | NaN | LAE | PAPUA NEW GUINEA | 0 | 0 | 0 | U | 0 | 0 | 0 | U | 0 | 0.000 | 0.000 |
2 | AYMD | MAG | MADANG | MADANG | PAPUA NEW GUINEA | 5 | 12 | 25 | S | 145 | 47 | 19 | E | 7 | -5.207 | 145.789 |
3 | AYMH | HGU | MOUNT HAGEN | MOUNT HAGEN | PAPUA NEW GUINEA | 5 | 49 | 34 | S | 144 | 17 | 46 | E | 1643 | -5.826 | 144.296 |
4 | AYNZ | LAE | NADZAB | NADZAB | PAPUA NEW GUINEA | 6 | 34 | 11 | S | 146 | 43 | 34 | E | 73 | -6.570 | 146.726 |
... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
9295 | ZYTK | NaN | NaN | SHENYANG | CHINA | 0 | 0 | 0 | U | 0 | 0 | 0 | U | 0 | 0.000 | 0.000 |
9296 | ZYTL | DLC | ZHOUSHUIZI | DALIAN | CHINA | 38 | 57 | 56 | N | 121 | 32 | 18 | E | 33 | 38.966 | 121.538 |
9297 | ZYXC | NaN | NaN | XIANCHENG | CHINA | 0 | 0 | 0 | U | 0 | 0 | 0 | U | 0 | 0.000 | 0.000 |
9298 | ZYYC | NaN | NaN | YICHUN | CHINA | 0 | 0 | 0 | U | 0 | 0 | 0 | U | 0 | 0.000 | 0.000 |
9299 | ZYYJ | NaN | YANJI | YANJI | CHINA | 42 | 52 | 54 | N | 129 | 26 | 54 | E | 191 | 42.882 | 129.448 |
9300 rows × 16 columns
Now we can see that we have data for multiple flights from each origin airport to a particular destination airport. For our analysis, let's combine these flights into a single observation with the market fare being the average cost of the flights from an origin to a destination. Additionally, let's drop the columns of data that we will not need.
flight_data = flight_data.groupby(['ORIGIN', 'DEST']).agg({'MARKET_FARE': 'mean'}).reset_index()
m,n = flight_data.shape
flight_data.columns = ['IATA code origin', 'IATA code destination','MARKET_FARE']
flight_data
IATA code origin | IATA code destination | MARKET_FARE | |
---|---|---|---|
0 | ABE | ABI | 490.350000 |
1 | ABE | ABQ | 360.833000 |
2 | ABE | ABY | 204.230000 |
3 | ABE | AEX | 215.100000 |
4 | ABE | AGS | 257.102727 |
... | ... | ... | ... |
64505 | YUM | TVC | 549.927500 |
64506 | YUM | TYS | 403.347500 |
64507 | YUM | VPS | 437.106667 |
64508 | YUM | XNA | 348.594000 |
64509 | YUM | YKM | 233.000000 |
64510 rows × 3 columns
location_data = location_data[location_data['Country']=='USA'].reset_index()
# now remove any rows with missing latitude/longitude
location_data = location_data[location_data['Latitude Direction']!='U']
# we only care about the decimal latitude and longitude, so remove unnecessary columns
location_data.drop(columns=['ICAO code','Airport Name','City','Country',
'index','Latitude Degrees','Latitude Minutes',
'Latitude Seconds','Latitude Direction',
'Longitude Degrees','Longitude Minutes',
'Longitude Seconds','Longitude Direction',
'Altitude'],inplace=True)
location_data
IATA code | Latitude Decimal | Longitude Decimal | |
---|---|---|---|
0 | ABI | 32.411 | -99.682 |
2 | ACK | 41.253 | -70.060 |
3 | ACT | 31.611 | -97.230 |
4 | ACY | 39.458 | -74.577 |
5 | ADM | 34.303 | -97.019 |
... | ... | ... | ... |
544 | BSF | 19.760 | -155.554 |
545 | ITO | 19.720 | -155.049 |
546 | UPP | 20.265 | -155.860 |
550 | OLI | 70.499 | -149.879 |
551 | PIZ | 69.733 | -163.005 |
486 rows × 3 columns
Finally, let's merge these two datasets together and remove any observations that do not have location data.
originLocation = location_data.copy(deep=True)
destLocation = location_data.copy(deep=True)
originLocation.columns = ['IATA code origin','Latitude Origin','Longitude Origin']
destLocation.columns = ['IATA code destination','Latitude Dest','Longitude Dest']
flight_data = flight_data.merge(originLocation,how='inner',on='IATA code origin')
flight_data = flight_data.merge(destLocation,how='inner',on='IATA code destination')
flight_data
IATA code origin | IATA code destination | MARKET_FARE | Latitude Origin | Longitude Origin | Latitude Dest | Longitude Dest | |
---|---|---|---|---|---|---|---|
0 | ABI | ABQ | 225.416667 | 32.411 | -99.682 | 35.040 | -106.609 |
1 | ACT | ABQ | 396.800000 | 31.611 | -97.230 | 35.040 | -106.609 |
2 | ADQ | ABQ | 360.800000 | 57.750 | -152.494 | 35.040 | -106.609 |
3 | AEX | ABQ | 296.555556 | 31.327 | -92.548 | 35.040 | -106.609 |
4 | AGS | ABQ | 277.019231 | 33.370 | -81.964 | 35.040 | -106.609 |
... | ... | ... | ... | ... | ... | ... | ... |
25130 | LAS | SCK | 101.682844 | 36.080 | -115.152 | 37.894 | -121.239 |
25131 | PHX | SCK | 87.290323 | 33.434 | -112.008 | 37.894 | -121.239 |
25132 | MCO | ILG | 92.186441 | 28.429 | -81.316 | 39.679 | -75.606 |
25133 | ORD | CDB | 1134.000000 | 41.979 | -87.904 | 55.206 | -162.724 |
25134 | PIE | IAG | 124.482759 | 27.911 | -82.687 | 43.107 | -78.946 |
25135 rows × 7 columns
Let's explore our dataset a bit to see general trends and heighten our understanding of the data. We will be using the Matplotlib library along with the Folium library to help visualize the data.
plt.figure()
bins = 10
plt.hist(flight_data['MARKET_FARE'], bins)
plt.title('Air Cost Distribution')
plt.xlabel('Fare ($)')
plt.ylabel('Count')
plt.show()
It's clear our flight cost data is skewed. Let's apply a logarithmic transform to the data to see the general trend.
plt.figure()
bins = 80
plt.hist(np.log2(flight_data["MARKET_FARE"]),bins)
plt.title('Air Cost Distribution')
plt.xlabel('Transformed Fare')
plt.ylabel('Count')
plt.show()
Interestingly enough, we see an approximately normal distribution in the transformed data, suggesting that fares are roughly log-normally distributed. Since the original data is skewed, let's use the median to get a sense of the center of the data.
median = flight_data["MARKET_FARE"].median()
median
281.05125
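To see why the median is the better choice here, we can compare it against the mean on a small set of illustrative fares (these values are made up, not from the dataset): a single expensive outlier pulls the mean upward, while the median stays put.

```python
import numpy as np

# illustrative fares with one expensive outlier
fares = np.array([90.0, 150.0, 220.0, 280.0, 310.0, 450.0, 1800.0])

print(np.mean(fares))    # pulled upward by the $1800 outlier
print(np.median(fares))  # robust center: 280.0
```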
Now let's use the Folium package to get a sense of the airport location distribution across the United States.
airport_map = folium.Map(location=[48.5,-110],zoom_start=3.2)
for i in range(originLocation.shape[0]):
    folium.CircleMarker(
        location=[originLocation.iloc[i]['Latitude Origin'], originLocation.iloc[i]['Longitude Origin']],
        radius=3,
        fill=True,
        fill_opacity=0.3,
        color='red',
        opacity=0.4,
        bubbling_mouse_events=True,
        popup=[originLocation.iloc[i]['IATA code origin']]
    ).add_to(airport_map)
airport_map
Now, let's visualize the connections from an airport. Since the number of connections between all the airports is too large to effectively visualize, let's take a look at the outgoing flights from the busiest airport in the United States: Hartsfield–Jackson Atlanta International Airport (ATL).
ATL = flight_data[flight_data['IATA code origin']=='ATL']
for i in range(ATL.shape[0]):
    folium.PolyLine([(ATL.iloc[i]['Latitude Origin'], ATL.iloc[i]['Longitude Origin']),
                     (ATL.iloc[i]['Latitude Dest'], ATL.iloc[i]['Longitude Dest'])],
                    opacity=0.2,
                    ).add_to(airport_map)
airport_map
The majority of the airport analysis will utilize NetworkX, a Python library used for storing, manipulating, and analyzing graph data structures. Since we want to analyze airports in terms of their connections, it makes sense to store the flight data in a graph structure, which represents the data in terms of nodes and edges: each node is an airport and each edge represents a flight connecting two airports. Since each observation in the flight data corresponds to a one-way trip, the edges need to have a direction associated with them, which is why we will use a directed graph.
The edges in a graph can contain information about certain properties between two nodes. In this project, we will encode the cost of a connecting flight in the associated edge between two nodes.
Let's construct the directed graph using NetworkX. By default, NetworkX uses a dictionary representation to store graphs. However, other storage methods exist, such as adjacency lists and adjacency matrices, each offering its own tradeoffs in terms of operational complexity and memory usage.
G = nx.DiGraph()
m,n = flight_data.shape
orig = flight_data['IATA code origin']
dest = flight_data['IATA code destination']
weight = flight_data['MARKET_FARE']
for i in range(m):
    G.add_edge(orig[i], dest[i], weight=weight[i])
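To make the dictionary representation concrete, here is a small sketch on a toy graph (the three-letter codes and fare weights are made up for illustration). Edges are looked up by node, then by neighbor; `nx.to_numpy_array` shows the same graph as a dense adjacency matrix, which costs O(V²) memory regardless of how many edges exist.

```python
import networkx as nx
import numpy as np

# toy directed fare graph (illustrative codes and weights)
toy = nx.DiGraph()
toy.add_edge('AAA', 'BBB', weight=150.0)
toy.add_edge('BBB', 'CCC', weight=200.0)

# NetworkX stores edges as nested dictionaries: node -> neighbor -> attributes
print(toy['AAA'])                   # {'BBB': {'weight': 150.0}}
print(toy['AAA']['BBB']['weight'])  # 150.0

# the same graph as a dense adjacency matrix
A = nx.to_numpy_array(toy, nodelist=['AAA', 'BBB', 'CCC'])
print(A)
```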
As described in the introduction, our first goal is to identify the airports that are the most popular or important. However, "importance" is a subjective term. How do we determine whether an airport, or in this case a node, is important or not? In graph theory, the analysis of the "importance" of a node is known as centrality analysis. We will go through a couple of centrality measures here, but many more exist.
Perhaps the most basic way to measure node importance is via degree centrality. This metric essentially counts the number of edges going into a node and the number of edges going out from it. These counts are then normalized by the number of other nodes in the graph. Since we have a directed graph, we can split degree centrality into in-degree centrality and out-degree centrality. In-degree gives higher scores to nodes with many incoming edges, and out-degree gives higher scores to nodes with many outgoing edges.
# get in-degree
indegreeCentrality = nx.in_degree_centrality(G)
indegreeCentrality = sorted(indegreeCentrality.items(), key=lambda x:x[1],reverse=True)
centrality = pd.DataFrame(indegreeCentrality)
centrality.columns = ['IATA code','in-degree']
# get out-degree
outdegreeCentrality = nx.out_degree_centrality(G)
outdegreeCentrality = sorted(outdegreeCentrality.items(), key=lambda x:x[1],reverse=True)
outdegreeCentrality = pd.DataFrame(outdegreeCentrality)
outdegreeCentrality.columns = ['IATA code','out-degree']
# merge results
centrality = centrality.merge(outdegreeCentrality,how='left',on='IATA code')
centrality
IATA code | in-degree | out-degree | |
---|---|---|---|
0 | SEA | 0.889908 | 0.885321 |
1 | LAS | 0.876147 | 0.876147 |
2 | PHX | 0.871560 | 0.885321 |
3 | LAX | 0.866972 | 0.866972 |
4 | ORD | 0.866972 | 0.862385 |
... | ... | ... | ... |
214 | MUE | 0.004587 | 0.004587 |
215 | SNP | 0.004587 | 0.009174 |
216 | CDB | 0.004587 | 0.004587 |
217 | IAG | 0.004587 | 0.004587 |
218 | MCN | 0.000000 | 0.004587 |
219 rows × 3 columns
Let's plot these results to see if there's a correlation between in-degree and out-degree centrality measurements for airports.
plt.scatter(centrality['in-degree'],centrality['out-degree'])
plt.title("In-degree vs. Out-degree Centrality for US Airports")
plt.xlabel("In-degree Score");
plt.ylabel("Out-degree Score");
As one might expect, there is a strong positive correlation between in-degree and out-degree centrality for airports. This means that airports with a lot of incoming flights tend to have a lot of outgoing flights as well.
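We could quantify this with a Pearson correlation coefficient via `np.corrcoef`. The sketch below uses a small stand-in DataFrame with illustrative values in place of the `centrality` DataFrame built above; in the notebook, the same two lines at the end would run directly on `centrality`.

```python
import numpy as np
import pandas as pd

# stand-in for the `centrality` DataFrame (illustrative values only)
df = pd.DataFrame({'in-degree':  [0.89, 0.88, 0.87, 0.10, 0.00],
                   'out-degree': [0.89, 0.88, 0.89, 0.12, 0.00]})

# Pearson correlation between the two centrality columns
r = np.corrcoef(df['in-degree'], df['out-degree'])[0, 1]
print(round(r, 3))
```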
Another way to define the importance of a node is by its ability to connect with other nodes efficiently. In our case, closeness centrality is a measure that determines which airports can reach, or be reached by, a large number of airports in a cost-effective manner. In a similar manner to degree centrality, we can split closeness centrality into an "outbound" and an "inbound" closeness metric.
Inbound closeness will value airports that allow cheap travel from many other locations. Outbound closeness will value airports that have cheap travel to many other locations.
inboundCloseness = nx.closeness_centrality(G,distance='weight')
inboundCloseness = sorted(inboundCloseness.items(), key=lambda x:x[1],reverse=True)
inboundCloseness = pd.DataFrame(inboundCloseness)
inboundCloseness.columns = ['IATA code','inbound closeness']
outboundCloseness = nx.closeness_centrality(G.reverse(),distance='weight')
outboundCloseness = sorted(outboundCloseness.items(), key=lambda x:x[1],reverse=True)
outboundCloseness = pd.DataFrame(outboundCloseness)
outboundCloseness.columns = ['IATA code','outbound closeness']
centrality = centrality.merge(inboundCloseness,how='inner',on='IATA code')
centrality = centrality.merge(outboundCloseness,how='inner',on='IATA code')
centrality
IATA code | in-degree | out-degree | inbound closeness | outbound closeness | |
---|---|---|---|---|---|
0 | SEA | 0.889908 | 0.885321 | 0.003223 | 0.003217 |
1 | LAS | 0.876147 | 0.876147 | 0.003625 | 0.003526 |
2 | PHX | 0.871560 | 0.885321 | 0.003236 | 0.003265 |
3 | LAX | 0.866972 | 0.866972 | 0.003287 | 0.003236 |
4 | ORD | 0.866972 | 0.862385 | 0.003523 | 0.003619 |
... | ... | ... | ... | ... | ... |
214 | MUE | 0.004587 | 0.004587 | 0.001472 | 0.001452 |
215 | SNP | 0.004587 | 0.009174 | 0.000671 | 0.000775 |
216 | CDB | 0.004587 | 0.004587 | 0.000708 | 0.000751 |
217 | IAG | 0.004587 | 0.004587 | 0.002365 | 0.002359 |
218 | MCN | 0.000000 | 0.004587 | 0.000000 | 0.002267 |
219 rows × 5 columns
Now let's plot inbound vs. outbound closeness centrality.
plt.scatter(centrality['inbound closeness'],centrality['outbound closeness'])
plt.xlim((0,0.004))
plt.ylim((0,0.004))
plt.title("Closeness Centrality for US Airports")
plt.xlabel("Inbound Score");
plt.ylabel("Outbound Score");
We see that there is a positive correlation between the inbound and outbound scores, though it is not as strong as the correlation for degree centrality. This means that, for the most part, if an airport can reach many other locations cheaply, then many other locations can also reach that airport cheaply.
Another metric we can compute is the eigenvector centrality of the graph. This essentially ranks airports in terms of the importance of the neighbors they are connected to. Thus, if an airport is a neighbor of a popular airport, then its eigenvector centrality score will be high.
We can use NetworkX once again to calculate the values for each node.
eigCent = nx.eigenvector_centrality(G)
eigCent = sorted(eigCent.items(), key=lambda x:x[1],reverse=True)
eigCent = pd.DataFrame(eigCent)
eigCent.columns = ['IATA code','eig centrality']
centrality = centrality.merge(eigCent,how="inner",on="IATA code")
centrality
IATA code | in-degree | out-degree | inbound closeness | outbound closeness | eig centrality | |
---|---|---|---|---|---|---|
0 | SEA | 0.889908 | 0.885321 | 0.003223 | 0.003217 | 9.110355e-02 |
1 | LAS | 0.876147 | 0.876147 | 0.003625 | 0.003526 | 9.057607e-02 |
2 | PHX | 0.871560 | 0.885321 | 0.003236 | 0.003265 | 9.082704e-02 |
3 | LAX | 0.866972 | 0.866972 | 0.003287 | 0.003236 | 8.872654e-02 |
4 | ORD | 0.866972 | 0.862385 | 0.003523 | 0.003619 | 9.076298e-02 |
... | ... | ... | ... | ... | ... | ... |
214 | MUE | 0.004587 | 0.004587 | 0.001472 | 0.001452 | 6.011927e-04 |
215 | SNP | 0.004587 | 0.009174 | 0.000671 | 0.000775 | 3.674683e-04 |
216 | CDB | 0.004587 | 0.004587 | 0.000708 | 0.000751 | 6.332345e-04 |
217 | IAG | 0.004587 | 0.004587 | 0.002365 | 0.002359 | 9.128486e-05 |
218 | MCN | 0.000000 | 0.004587 | 0.000000 | 0.002267 | 5.763433e-17 |
219 rows × 6 columns
Centrality, however, is not the only thing that may affect airport importance. Another factor is the number of airports within a city. For example, if a city has a large population but only one airport, that airport should be much more important than any single airport in a city with a similarly large population but five airports.
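One way to approximate this would be to count airports per city using the 'City' column of the original location file. Since `location_data` has already been reduced to coordinates at this point, the sketch below uses a small stand-in table with illustrative rows (the city assignments here are made up for the example).

```python
import pandas as pd

# stand-in rows mimicking the original location file (illustrative city labels)
airports = pd.DataFrame({
    'IATA code': ['JFK', 'LGA', 'EWR', 'ATL'],
    'City':      ['NEW YORK', 'NEW YORK', 'NEW YORK', 'ATLANTA'],
})

# number of airports serving each city; this count could be used
# to down-weight the importance score of airports in crowded markets
airports_per_city = airports.groupby('City')['IATA code'].count()
print(airports_per_city)
```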
Now, we want to know the cheapest path given a starting and ending airport. In graph theory, this type of problem is solved via implementing a shortest path algorithm. These algorithms return the path between two nodes that minimizes the sum of the weights of the edges traversed. As described in the link above, there are many algorithms that perform this task. When all the edges have positive weights, the most popular algorithm is Dijkstra's algorithm (named after Edsger W. Dijkstra).
Here, I will implement Dijkstra's shortest path algorithm to allow a passenger to plan out their travel in a cost-effective manner. First, I will define a priority queue class to store traversed nodes with the associated cost and flight path.
class PriorityQueue:
    def __init__(self):
        self.q = []

    def isEmpty(self):
        return len(self.q) == 0

    def enqueue(self, obj):
        self.q.append(obj)

    def dequeue(self):
        # O(n) complexity: linear scan for the entry with the lowest cost
        assert not self.isEmpty()
        currentMin = 0
        for i in range(len(self.q)):
            if self.q[i][0] < self.q[currentMin][0]:
                currentMin = i
        Min = self.q[currentMin]
        del self.q[currentMin]
        return Min

    def reassignDistance(self, dist, adjacentNode, currentPath):
        # dist is equal to dist(v) + weight(u, v)
        # O(n) complexity
        cp = copy.deepcopy(currentPath)
        for i in range(len(self.q)):
            if self.q[i][1] == adjacentNode:
                if dist <= self.q[i][0]:
                    # a cheaper route was found, so reassign the shorter distance
                    self.q[i][0] = dist
                    # append the neighbor airport to the end of the current path
                    cp.append(self.q[i][1])
                    # update the current path
                    self.q[i][2] = cp
                break
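The linear-scan `dequeue` above is O(n) per operation; for larger graphs, Python's built-in `heapq` module gives O(log n) pushes and pops. A minimal sketch of the same idea using lazy deletion (instead of reassigning distances in place, a cheaper entry is pushed and stale entries are skipped when popped); the airports and costs here are illustrative.

```python
import heapq

pq = []  # heap of (cost, airport, path) tuples; heapq orders by cost first
heapq.heappush(pq, (0, 'LAX', []))
heapq.heappush(pq, (120.0, 'SFO', ['LAX']))
heapq.heappush(pq, (95.0, 'SAN', ['LAX']))

visited = set()
order = []
while pq:
    cost, node, path = heapq.heappop(pq)
    if node in visited:
        continue  # stale entry: a cheaper one was already processed
    visited.add(node)
    order.append((node, cost))

print(order)  # cheapest first: LAX, then SAN, then SFO
```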
Now let's implement Dijkstra's algorithm itself. I will create a function that accepts a directed graph G and the starting airport as a string in IATA code format.
def DijkstraShortestPath(G, starting_airport):
    # initialize priority queue
    P = PriorityQueue()
    # initialize list to store shortest path information
    Results = []
    # get all nodes from graph
    all_nodes = list(G.nodes())
    # the starting airport gets a cost of zero; all other airports start at cost = Inf
    for i in all_nodes:
        if i == starting_airport:
            P.enqueue([0, i, []])
        else:
            P.enqueue([math.inf, i, []])
    # repeatedly dequeue the lowest-cost node from P and update the distances to its neighbors
    while not P.isEmpty():
        # dequeue from the priority queue and get the current node
        v = P.dequeue()
        currentNode = v[1]
        # if the current node does not have any outbound flights, fully dequeue and return
        if G.out_degree(currentNode) == 0:
            while not P.isEmpty():
                Results = Results + [v]
                v = P.dequeue()
            Results = Results + [v]
            break
        # get all neighboring airports of the current node
        connectedNodes = G.neighbors(currentNode)
        # iterate through the neighbor airports and compute the costs
        for i in connectedNodes:
            # the neighbor u has a new minimum cost if: currentCost(v) + pathCost(v to u) < currentCost(u)
            newCost = v[0] + G[currentNode][i]['weight']
            # reassignDistance updates the cost and path of the neighbor in the priority queue if needed
            P.reassignDistance(newCost, i, v[2])
        # store results
        Results = Results + [v]
    # save results
    filehandler = open("PathResults.npy", 'wb')
    pickle.dump(Results, filehandler)
    filehandler.close()
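As a sanity check, NetworkX ships an equivalent built-in, `nx.single_source_dijkstra_path_length`, which we could compare our implementation against. A sketch on a toy fare graph (codes and weights are illustrative): the cheapest LAX-to-ORD route goes through DEN, not the direct edge.

```python
import networkx as nx

# toy fare graph (illustrative weights)
H = nx.DiGraph()
H.add_edge('LAX', 'DEN', weight=100.0)
H.add_edge('DEN', 'ORD', weight=80.0)
H.add_edge('LAX', 'ORD', weight=250.0)

# cheapest total cost from LAX to every reachable airport
costs = nx.single_source_dijkstra_path_length(H, 'LAX', weight='weight')
print(costs)  # {'LAX': 0, 'DEN': 100.0, 'ORD': 180.0}
```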
First let's pick a starting airport. As an example we will use Los Angeles International Airport (LAX). By calling the Dijkstra algorithm with this starting airport, the function will save a file that contains the shortest paths from LAX to all other airports.
starting_airport = 'LAX'
DijkstraShortestPath(G,starting_airport)
Now let's open the shortest path file and visualize the costs to other airports.
file_g = open('PathResults.npy','rb')
cheapest_paths = pickle.load(file_g)
costs = []
dests = []
for i in range(len(cheapest_paths)):
    costs.append(cheapest_paths[i][0])
    dests.append(cheapest_paths[i][1])
layover_df = pd.DataFrame(costs,columns=['Costs'])
layover_df['IATA code'] = dests
layover_df = layover_df.merge(location_data,how='left',on='IATA code')
# remove rows that have infinite cost (can't get there)
infIdx = layover_df["Costs"]==math.inf
layover_df = layover_df[~infIdx]
Here, I will display the cheapest options from the starting airport in green, medium cost options in yellow, and expensive options in red. I will also use another plot to display the costs if we did not use Dijkstra's algorithm (i.e., if we only consider direct flights).
First, let's create a color scale ranging from green to yellow to red. The Folium package takes color inputs as hexadecimal strings, so I will define a conversion function below.
# Define the color points and corresponding colors
color_points = [0, 0.33, 0.66, 1]
colors = ['#00FF00', '#FFFF00', '#FFA500', '#FF0000'] # Green, Yellow, Orange, Red
cmap = LinearSegmentedColormap.from_list('custom_cmap', list(zip(color_points, colors)))
# Create an array with interpolated colors
N = 1001
color_array = cmap(np.linspace(0, 1, N))
def rgba_to_hex(rgba_tuple):
    r, g, b, a = rgba_tuple
    return "#{:02X}{:02X}{:02X}".format(int(r * 255), int(g * 255), int(b * 255))
# Convert the RGBA values to hex color codes
hex_colors = [rgba_to_hex(rgba) for rgba in color_array]
Now let's normalize the costs and perform logarithmic transformations. This will allow us to visualize the differences in costs better.
# get flight cost data for direct flights (no Dijkstra's algorithm) from starting airport
no_layover = flight_data[flight_data['IATA code origin']==starting_airport]
# initialize folium maps
new_map = folium.Map(location=[48.5,-110],zoom_start=3.2)
old_map = folium.Map(location=[48.5,-110],zoom_start=3.2)
# normalize all costs
no_layover_cost = np.array(no_layover['MARKET_FARE'])
layover_cost = np.array(layover_df['Costs'])
layover_cost = layover_cost[1:]
# shift and scale all costs to [0, 1]
leastCost = min(no_layover_cost.min(), layover_cost.min())
no_layover_cost = no_layover_cost - leastCost
layover_cost = layover_cost - leastCost
maxCost = max(no_layover_cost.max(), layover_cost.max())
no_layover_cost = no_layover_cost/maxCost
layover_cost = layover_cost/maxCost
# since costs are skewed, perform logarithmic transformation
layover_cost = np.log(1+layover_cost)
no_layover_cost = np.log(1+no_layover_cost)
# normalize the transformed data again to [0, 1] to match color_array
leastCost = min(no_layover_cost.min(), layover_cost.min())
no_layover_cost = no_layover_cost - leastCost
layover_cost = layover_cost - leastCost
maxCost = max(no_layover_cost.max(), layover_cost.max())
no_layover_cost = no_layover_cost/maxCost
layover_cost = layover_cost/maxCost
Now add the cost data along with the airport location data to both Folium maps.
# add to layover map
for i in range(layover_df.shape[0]):
    if i == 0:
        continue
    cost = layover_cost[i-1]
    color_idx = int(math.floor(cost*(len(color_array)-1)))
    folium.CircleMarker(
        location=[layover_df.iloc[i]['Latitude Decimal'], layover_df.iloc[i]['Longitude Decimal']],
        radius=2,
        fill=True,
        fill_opacity=1,
        color=hex_colors[color_idx],
        opacity=1,
        bubbling_mouse_events=True,
        popup=[layover_df.iloc[i]['IATA code'], layover_df.iloc[i]['Costs']]
    ).add_to(new_map)
# add origin airport as a big black point
folium.CircleMarker(
    location=[layover_df.iloc[0]['Latitude Decimal'], layover_df.iloc[0]['Longitude Decimal']],
    radius=6,
    fill=True,
    fill_opacity=1,
    color='black',
    opacity=1,
    bubbling_mouse_events=True,
    popup=[layover_df.iloc[0]['IATA code']]
).add_to(new_map)
# add to no layover map (direct flights)
for i in range(no_layover.shape[0]):
    cost = no_layover_cost[i]
    color_idx = int(round(cost*(len(color_array)-1)))
    folium.CircleMarker(
        location=[no_layover.iloc[i]['Latitude Dest'], no_layover.iloc[i]['Longitude Dest']],
        radius=2,
        fill=True,
        fill_opacity=1,
        color=hex_colors[color_idx],
        opacity=1,
        bubbling_mouse_events=True,
        popup=[no_layover.iloc[i]['IATA code destination'], no_layover.iloc[i]['MARKET_FARE']]
    ).add_to(old_map)
# add origin airport as a big black point
folium.CircleMarker(
    location=[no_layover.iloc[0]['Latitude Origin'], no_layover.iloc[0]['Longitude Origin']],
    radius=6,
    fill=True,
    fill_opacity=1,
    color='black',
    opacity=1,
    bubbling_mouse_events=True,
    popup=[no_layover.iloc[0]['IATA code origin']]
).add_to(old_map);
Finally, let's plot a map of the airports reached via Dijkstra's algorithm. As described before, green points represent cheaper flights and red points represent more expensive flights. Feel free to click on points to view the corresponding airport as well as the cost to fly there.
new_map
This looks pretty good! But let's compare this map to one that does not use Dijkstra's algorithm. The map below color codes the costs of direct flights from LAX.
old_map
Using centrality analysis, we can see that the importance of airports can be quantified. Depending on how we define "importance," there are various ways to rank airports. In terms of in-degree centrality, the top 5 most important airports were SEA, LAS, PHX, LAX, and ORD. In terms of out-degree centrality, the top 5 were SEA, PHX, LAS, LAX, and DEN. In terms of inbound closeness, the top 5 were LGB, DEN, STL, LAS, and MDW. In terms of outbound closeness, the top 5 were LGB, DEN, ORD, STL, and BNA. Finally, in terms of eigenvector centrality, the top 5 were SEA, PDX, PHX, ORD, and LAS.
Just from these results, we can tell that SEA, PHX, and LAS are airports that are very well connected and are crucial to domestic travel. In terms of accessibility, the closeness metrics tell us that LGB, DEN, and STL have the most "bang for their buck." In other words, those airports let passengers travel to a pretty wide range of cities in the US for cheap.
The implementation of the shortest path algorithm is a good base for estimating costs from a starting airport. We are able to see from the two maps above that not only do flights to certain locations get cheaper, but also the number of locations a passenger is able to reach increases. Since the dataset used in this project only had US flight data, it would be interesting to see how these metrics and travel plans change when considering a global dataset. I would expect to see a more diverse map in terms of destination airports and flight costs. Regardless, the steps taken in this project can easily be scaled to the international level just by analyzing a different dataset.
Additionally, the shortest path algorithm does not account for the times of connecting flights. A real-world application of the algorithm would only be useful if we had flight time data and were able to coordinate connecting flights and costs accordingly. While this does complicate the problem, one approach would be to make graphs for each time interval. For example, one could construct a graph consisting of all flights occurring at 9:00PM EST, another graph at 9:30PM EST, another graph at 10:00PM EST, and so on.
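The time-interval idea can be sketched as a time-expanded graph: each node becomes an (airport, time) pair, and an edge is only added when the connecting flight departs after the inbound flight arrives. A minimal illustration (the airports, times, and fares below are hypothetical):

```python
import networkx as nx

T = nx.DiGraph()
# nodes are (airport, time) pairs; an edge exists only if the
# connection is feasible in sequence
T.add_edge(('LAX', '21:00'), ('DEN', '23:30'), weight=100.0)  # LAX -> DEN leg
T.add_edge(('DEN', '23:30'), ('ORD', '01:00'), weight=80.0)   # feasible connection
# a DEN departure earlier than 23:30 would simply not be linked here

# shortest paths now respect both cost and timing
costs = nx.single_source_dijkstra_path_length(T, ('LAX', '21:00'))
print(costs)
```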
Hopefully this tutorial has given you a sense of the data science pipeline, starting from the collection and tidying of data, to data visualization, and to data modeling and interpretation. More importantly, I hope this tutorial gives you a sense of the capabilities of data science. Thank you for reading!