How to Pivot and Plot Data With Pandas

>>> import pandas as pd>>> df = pd.read_csv('865214564_T_T100_MARKET_ALL_CARRIER.zip')
>>> df.shape
(321409, 41)
>>> df.columns
Index(['PASSENGERS', 'FREIGHT', 'MAIL', 'DISTANCE', 'UNIQUE_CARRIER',
'AIRLINE_ID', 'UNIQUE_CARRIER_NAME', 'UNIQUE_CARRIER_ENTITY',
'REGION', 'CARRIER', 'CARRIER_NAME', 'CARRIER_GROUP',
'CARRIER_GROUP_NEW', 'ORIGIN_AIRPORT_ID',
'ORIGIN_AIRPORT_SEQ_ID', 'ORIGIN_CITY_MARKET_ID', 'ORIGIN',
'ORIGIN_CITY_NAME', 'ORIGIN_STATE_ABR', 'ORIGIN_STATE_FIPS',
'ORIGIN_STATE_NM', 'ORIGIN_COUNTRY', 'ORIGIN_COUNTRY_NAME',
'ORIGIN_WAC', 'DEST_AIRPORT_ID', 'DEST_AIRPORT_SEQ_ID',
'DEST_CITY_MARKET_ID', 'DEST', 'DEST_CITY_NAME', 'DEST_STATE_ABR',
'DEST_STATE_FIPS', 'DEST_STATE_NM', 'DEST_COUNTRY',
'DEST_COUNTRY_NAME', 'DEST_WAC', 'YEAR', 'QUARTER', 'MONTH',
'DISTANCE_GROUP', 'CLASS', 'DATA_SOURCE'],
dtype='object')
>>> df = df.rename(lambda x: x.lower(), axis=1)
>>> df.head()
# download flight class meanings at
# https://www.transtats.bts.gov/Download_Lookup.asp?Y11x72=Y_fReiVPR_PYNff
>>> top_airlines = df.query('`class` == "F"')\
... .groupby('unique_carrier_name').passengers.sum()\
... .nlargest(5)
>>> top_airlines
unique_carrier_name
Southwest Airlines Co. 162681011.0
Delta Air Lines Inc. 162260114.0
American Airlines Inc. 155782611.0
United Air Lines Inc. 116212143.0
JetBlue Airways 42830602.0
Name: passengers, dtype: float64
>>> df.loc[
... df.unique_carrier_name.isin(top_airlines.index), 'class'
... ].value_counts()
F 97293
L 3994
Name: class, dtype: int64
  • Unique values in the dest_city_name column should be used as our row labels (the index argument)
  • Unique values in the unique_carrier_name column should be used as our column labels (the columns argument)
  • The values used for the aggregation should come from the passengers column (the values argument), and they should be summed (the aggfunc argument)
  • Row/column subtotals should be calculated (the margins argument)
>>> pivot = df.query('`class` == "F"').pivot_table(
... index='dest_city_name',
... columns='unique_carrier_name',
... values='passengers',
... aggfunc='sum',
... margins=True
... ).sort_values('All', ascending=False)
>>> pivot.head(10)
>>> pivot = pivot.drop('All').head(10)
>>> pivot[top_airlines.sort_index().index]
>>> from matplotlib import ticker>>> ax = pivot[top_airlines.sort_index().index].plot(
... kind='barh', stacked=True,
... title='2019 Passenger Totals\n(source: BTS)'
... )
>>> ax.invert_yaxis() # put destinations with more passengers on top
# formatting
>>> ax.set(xlabel='number of passengers', ylabel='destination')
>>> ax.legend(title='carrier')
# shows x-axis in millions instead of scientific notation
>>> ax.xaxis.set_major_formatter(ticker.EngFormatter())
# removes the top and right lines from the figure to make it less boxy
>>> for spine in ['top', 'right']:
... ax.spines[spine].set_visible(False)
>>> ax = pivot[top_airlines.sort_index().index].assign(
... Other=lambda x: pivot.All - x.sum(axis=1)
... ).plot(
... kind='barh', stacked=True,
... title='2019 Passenger Totals\n(source: BTS)'
... )
>>> ax.invert_yaxis()
# formatting
>>> ax.set(xlabel='number of passengers', ylabel='destination')
>>> ax.xaxis.set_major_formatter(ticker.EngFormatter())
# shift legend to not cover the bars
>>> ax.legend(title='carrier', bbox_to_anchor=(0.7, 0), loc='lower left')
>>> for spine in ['top', 'right']:
... ax.spines[spine].set_visible(False)
>>> normalized_pivot = \
... pivot[top_airlines.sort_index().index].apply(lambda x: x / pivot.All)
>>> normalized_pivot
# determine sort order
>>> market_share_sorted = normalized_pivot.sum(axis=1).sort_values()
>>> ax = normalized_pivot.loc[market_share_sorted.index,:].plot(
... kind='barh', stacked=True, xlim=(0, 1),
... title='2019 Market Share\n(source: BTS)'
... )
# formatting
>>> ax.set(xlabel='percentage of all passengers', ylabel='destination')
>>> ax.legend(title='carrier', bbox_to_anchor=(0.7, 0), loc='lower left')
# show x-axis as percentages
>>> ax.xaxis.set_major_formatter(ticker.PercentFormatter(xmax=1))
>>> for spine in ['top', 'right']:
... ax.spines[spine].set_visible(False)
>>> pivot.loc['Seattle, WA', :].nlargest(6)
unique_carrier_name
All 25084302.0
Alaska Airlines Inc. 9637977.0
Delta Air Lines Inc. 4906617.0
Horizon Air 2454491.0
Southwest Airlines Co. 1446404.0
United Air Lines Inc. 1383381.0
Name: Seattle, WA, dtype: float64

About the author/ODSC Europe 2021 Speaker on Pandas:

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
ODSC - Open Data Science

ODSC - Open Data Science

94K Followers

Our passion is bringing thousands of the best and brightest data scientists together under one roof for an incredible learning and networking experience.