Frequencies and Chaining in Python-Pandas

import sys
functdir = "c:/steve/jupyter/notebooks/functions"
sys.path.append(functdir)
from pythonfunctions import *
blanks(2)
import warnings, os, numpy as np, pandas as pd, time, datetime, re, string, random, math, feather
from collections import Counter
from functools import reduce
warnings.filterwarnings('ignore')
blanks(2)
start = time.time()
wdir = "c:/bigdata/raw/chicago"
os.chdir(wdir)
ifile = "odsccrimeplus.feather"
chicagocrime = feather.read_dataframe(ifile).reset_index(drop=True)
chicagocrime.eval("""
year = date.dt.year
month = date.dt.month
day = date.dt.day
""",inplace=True)
metadf(chicagocrime)
end = time.time()
print(end - start)
blanks(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7092612 entries, 0 to 7092611
Data columns (total 27 columns):
id int32
casenumber object
date datetime64[ns, UTC]
block object
iucr object
primarytype object
description object
locationdescription object
arrest bool
domestic bool
beat int32
district float64
ward float64
communityarea float64
fbicode object
xcoordinate float64
ycoordinate float64
year int64
updatedon datetime64[ns, UTC]
latitude float64
longitude float64
fbicodedesc object
name object
population float64
month int64
day int64
hour int32
dtypes: bool(2), datetime64[ns, UTC](2), float64(8), int32(3), int64(3), object(9)
memory usage: 1.3+ GB
None
8.357961177825928
start = time.time()
chicagocrime = mkdowncast(chicagocrime)
metadf(chicagocrime)
blanks(1)
end = time.time()
print(end - start)
blanks(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7092612 entries, 0 to 7092611
Data columns (total 27 columns):
id int32
casenumber object
date datetime64[ns, UTC]
block object
iucr object
primarytype object
description object
locationdescription object
arrest bool
domestic bool
beat int16
district float32
ward float32
communityarea float32
fbicode object
xcoordinate float32
ycoordinate float32
year int16
updatedon datetime64[ns, UTC]
latitude float32
longitude float32
fbicodedesc object
name object
population float32
month int8
day int8
hour int8
dtypes: bool(2), datetime64[ns, UTC](2), float32(8), int16(2), int32(1), int8(3), object(9)
memory usage: 899.6+ MB
None
2.7455520629882812
freqs = pd.DataFrame(chicagocrime['name'].value_counts())
print(freqs.name.sum())
print(freqs)
blanks(1)
freqs = pd.DataFrame(chicagocrime['name'].value_counts(dropna=False))
print(freqs.name.sum())
print(freqs)
blanks(2)

6479024
name
Austin 410333
Near North Side 225363
South Shore 211725
Humboldt Park 204517
Near West Side 192609
... ...
Oakland 14323
Hegewisch 14008
Forest Glen 11628
Burnside 9689
Edison Park 6272
[77 rows x 1 columns]
7092612
name
NaN 613588
Austin 410333
Near North Side 225363
South Shore 211725
Humboldt Park 204517
... ...
Oakland 14323
Hegewisch 14008
Forest Glen 11628
Burnside 9689
Edison Park 6272
[78 rows x 1 columns]
var = ['name']
freqs = chicagocrime[var].groupby(var).size()
print(freqs.sum(),"\n")
print(freqs)
blanks(1)
print(type(freqs))
blanks(1)
freqs = chicagocrime[var].groupby(var).size().reset_index()
print(freqs.iloc[:,1].sum(),"\n")
print(freqs)
blanks(1)
print(type(freqs))
blanks(2)

6479024 name
Albany Park 57799
Archer Heights 23259
Armour Square 24116
Ashburn 59028
Auburn Gresham 183696
...
West Lawn 48320
West Pullman 106090
West Ridge 81138
West Town 190911
Woodlawn 105282
Length: 77, dtype: int64
<class 'pandas.core.series.Series'>
6479024 name 0
0 Albany Park 57799
1 Archer Heights 23259
2 Armour Square 24116
3 Ashburn 59028
4 Auburn Gresham 183696
.. ... ...
72 West Lawn 48320
73 West Pullman 106090
74 West Ridge 81138
75 West Town 190911
76 Woodlawn 105282
[77 rows x 2 columns]
<class 'pandas.core.frame.DataFrame'>
MISSING = math.inf
var = ['name','district']
freqs = chicagocrime[var].fillna(MISSING).groupby(var).size().reset_index()
print(freqs.iloc[:,2].sum())
metadf(freqs,data=True)
blanks(2)
7092612
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 3 columns):
name 416 non-null object
district 416 non-null float64
0 416 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.9+ KB
None
name district 0
0 inf 1.0 20007
1 inf 2.0 42898
2 inf 3.0 30632
3 inf 4.0 31962
4 inf 5.0 25770
name district 0
411 West Town inf 4
412 Woodlawn 2.0 1072
413 Woodlawn 3.0 104206
414 Woodlawn 7.0 2
415 Woodlawn inf 2
MISSING = math.inf
var = ['name','district']
freqs = chicagocrime[var].fillna(MISSING).groupby(var).size().reset_index().replace(MISSING,np.NaN)
print(freqs.iloc[:,2].sum(),"\n")
metadf(freqs,data=True)
blanks(2)
7092612 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 3 columns):
name 393 non-null object
district 387 non-null float64
0 416 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.9+ KB
None
name district 0
0 NaN 1.0 20007
1 NaN 2.0 42898
2 NaN 3.0 30632
3 NaN 4.0 31962
4 NaN 5.0 25770
name district 0
411 West Town NaN 4
412 Woodlawn 2.0 1072
413 Woodlawn 3.0 104206
414 Woodlawn 7.0 2
415 Woodlawn NaN 2
MISSING = math.inf
var = ['name','district']
freqs = chicagocrime[var].fillna(MISSING).groupby(var).size()\
.reset_index().replace(MISSING,np.NaN)\
.rename(columns={0:'frequency'}).sort_values(['frequency']+var, ascending=False)\
.reset_index(drop=True)
print(freqs.iloc[:,2].sum(),"\n")
metadf(freqs,data=True)
blanks(2)
7092612 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 3 columns):
name 393 non-null object
district 387 non-null float64
frequency 416 non-null int64
dtypes: float64(1), int64(1), object(1)
memory usage: 9.9+ KB
None
name district frequency
0 Austin 15.0 281985
1 Near North Side 18.0 225330
2 West Englewood 7.0 188959
3 Englewood 7.0 167079
4 Chicago Lawn 8.0 159647
name district frequency
411 Ashburn 10.0 1
412 Ashburn 7.0 1
413 Archer Heights 31.0 1
414 Albany Park 8.0 1
415 Albany Park NaN 1
MISSING = math.inf
var = ['name','district']
freqs = chicagocrime[var].fillna(MISSING).groupby(var).size()\
.reset_index().replace(MISSING,np.NaN)\
.rename(columns={0:'frequency'}).sort_values(['frequency']+var, ascending=False)\
.assign(percent=lambda x: 100*(x.frequency/x.frequency.sum())).reset_index(drop=True)
print(freqs.iloc[:,2].sum(),"\n")
metadf(freqs,data=True)
blanks(2)
7092612 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 416 entries, 0 to 415
Data columns (total 4 columns):
name 393 non-null object
district 387 non-null float64
frequency 416 non-null int64
percent 416 non-null float64
dtypes: float64(2), int64(1), object(1)
memory usage: 13.1+ KB
None
name district frequency percent
0 Austin 15.0 281985 3.975757
1 Near North Side 18.0 225330 3.176968
2 West Englewood 7.0 188959 2.664167
3 Englewood 7.0 167079 2.355677
4 Chicago Lawn 8.0 159647 2.250891
name district frequency percent
411 Ashburn 10.0 1 0.000014
412 Ashburn 7.0 1 0.000014
413 Archer Heights 31.0 1 0.000014
414 Albany Park 8.0 1 0.000014
415 Albany Park NaN 1 0.000014
print(freqs.loc[pd.isna(freqs.name)])
blanks(1)
print(freqs.loc[pd.isna(freqs.district)])
blanks(2)name district frequency percent
48 NaN 2.0 42898 0.604827
54 NaN 8.0 39436 0.556015
61 NaN 11.0 35774 0.504384
62 NaN 25.0 35220 0.496573
66 NaN 7.0 33730 0.475565
68 NaN 6.0 32182 0.453740
70 NaN 4.0 31962 0.450638
71 NaN 12.0 31779 0.448058
73 NaN 9.0 30963 0.436553
74 NaN 3.0 30632 0.431886
75 NaN 19.0 29005 0.408947
77 NaN 18.0 28524 0.402165
79 NaN 14.0 28230 0.398020
86 NaN 15.0 25971 0.366170
87 NaN 5.0 25770 0.363336
93 NaN 10.0 24817 0.349899
106 NaN 1.0 20007 0.282082
107 NaN 24.0 19093 0.269196
108 NaN 22.0 19057 0.268688
109 NaN 17.0 18280 0.257733
110 NaN 16.0 17779 0.250669
121 NaN 20.0 12459 0.175662
161 NaN 31.0 20 0.000282
name district frequency percent
185 West Town NaN 4 0.000056
192 Near North Side NaN 4 0.000056
201 Humboldt Park NaN 3 0.000042
206 Austin NaN 3 0.000042
210 Woodlawn NaN 2 0.000028
218 Washington Park NaN 2 0.000028
219 Washington Heights NaN 2 0.000028
222 The Loop NaN 2 0.000028
226 South Lawndale NaN 2 0.000028
236 Lincoln Park NaN 2 0.000028
243 Englewood NaN 2 0.000028
248 Chicago Lawn NaN 2 0.000028
261 West Ridge NaN 1 0.000014
265 West Garfield Park NaN 1 0.000014
269 West Englewood NaN 1 0.000014
289 Roseland NaN 1 0.000014
302 Portage Park NaN 1 0.000014
307 O'Hare NaN 1 0.000014
312 North Center NaN 1 0.000014
318 Near West Side NaN 1 0.000014
329 Lower West Side NaN 1 0.000014
346 Lake View NaN 1 0.000014
356 Hermosa NaN 1 0.000014
358 Greater Grand Crossing NaN 1 0.000014
375 Dunning NaN 1 0.000014
397 Belmont Cragin NaN 1 0.000014
402 Avalon Park NaN 1 0.000014
409 Auburn Gresham NaN 1 0.000014
415 Albany Park NaN 1 0.000014
def freqsdf(df, var):
    """
    Build a frequency/percent table for the combination of columns in `var`.

    (df - pandas dataframe; var - list of dataframe columns)

    Missing values are temporarily encoded as math.inf so that groupby
    retains them as a grouping level, then restored to NaN afterward.

    Returns a dataframe with columns var + ['frequency', 'percent'],
    sorted by descending frequency (then by the grouping columns), or
    None — with a message printed — when `var` has duplicates, names a
    column not in `df`, or names a category-dtype column.
    """

    # Sentinel for NaN: groupby would otherwise drop NaN group keys.
    MISSING = math.inf

    if len(var) != len(set(var)):
        print("duplicate columns\n")
        return None

    if not all(v in df.columns for v in var):
        print("invalid column(s)")
        return None

    # Category columns can't absorb the math.inf sentinel via fillna.
    if any(str(v) == "category" for v in df[var].dtypes):
        print("invalid category column(s)")
        return None

    # np.nan, not np.NaN: the NaN alias was removed in NumPy 2.0.
    freqs = df[var].fillna(MISSING).groupby(var).size().reset_index().replace(MISSING, np.nan)\
        .rename(columns={0: 'frequency'}).sort_values(['frequency'] + var, ascending=False)\
        .assign(percent=lambda x: 100 * (x.frequency / x.frequency.sum())).reset_index(drop=True)

    return freqs
var = ['fbicode','fbicodedesc']
freqs = freqsdf(chicagocrime,var)
print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)
blanks(2)

7092612 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 26 entries, 0 to 25
Data columns (total 4 columns):
fbicode 26 non-null object
fbicodedesc 26 non-null object
frequency 26 non-null int64
percent 26 non-null float64
dtypes: float64(1), int64(1), object(2)
memory usage: 960.0+ bytes
None
fbicode fbicodedesc frequency percent
0 06 Larceny 1500365 21.153913
1 08B Simple Battery 1108831 15.633606
2 14 Vandalism 806114 11.365545
3 26 Misc Non-Index Offense 716041 10.095590
4 18 Drug Abuse 686916 9.684951
fbicode fbicodedesc frequency percent
21 09 Arson 11535 0.162634
22 01A Homicide 1st & 2nd Degree 10084 0.142176
23 13 Stolen Property 2839 0.040028
24 12 Embezzlement 1475 0.020796
25 01B Involuntary Manslaughter 51 0.000719
start = time.time()
var = ['latitude','longitude']
freqs = freqsdf(chicagocrime,var)
print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)
blanks(1)
end = time.time()
print(end - start)
blanks(2)
7092612 <class 'pandas.core.frame.DataFrame'>
RangeIndex: 867302 entries, 0 to 867301
Data columns (total 4 columns):
latitude 867301 non-null float64
longitude 867301 non-null float64
frequency 867302 non-null int64
percent 867302 non-null float64
dtypes: float64(3), int64(1)
memory usage: 26.5 MB
None
latitude longitude frequency percent
0 NaN NaN 68077 0.959830
1 41.976292 -87.905228 13518 0.190593
2 41.754593 -87.741531 9813 0.138355
3 41.883499 -87.627876 7655 0.107929
4 41.897896 -87.624100 4663 0.065744
latitude longitude frequency percent
867297 41.644604 -87.609550 1 0.000014
867298 41.644604 -87.610893 1 0.000014
867299 41.644604 -87.611778 1 0.000014
867300 41.644600 -87.597176 1 0.000014
867301 41.644588 -87.616081 1 0.000014
2.6000795364379883
hcde = ["01A"]
vcde = list(flatten([hcde,"01B","02","03","04A","04B"]))
pcde = ["05","06","07","09"]
icde = list(flatten([vcde,pcde]))
chicagocrime.eval("""
hbool = fbicode in @hcde
vbool = fbicode in @vcde
pbool = fbicode in @pcde
ibool = fbicode in @icde
""",inplace=True)
metadf(chicagocrime)
blanks(2)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7092612 entries, 0 to 7092611
Data columns (total 31 columns):
id int32
casenumber object
date datetime64[ns, UTC]
block object
iucr object
primarytype object
description object
locationdescription object
arrest bool
domestic bool
beat int16
district float32
ward float32
communityarea float32
fbicode object
xcoordinate float32
ycoordinate float32
year int16
updatedon datetime64[ns, UTC]
latitude float32
longitude float32
fbicodedesc object
name object
population float32
month int8
day int8
hour int8
hbool bool
vbool bool
pbool bool
ibool bool
dtypes: bool(6), datetime64[ns, UTC](2), float32(8), int16(2), int32(1), int8(3), object(9)
memory usage: 926.7+ MB
None
var = ['name','district','fbicodedesc']
freqs = freqsdf(chicagocrime,var)

print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)

blanks(2)
print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)
blanks(2)

7092612



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4211 entries, 0 to 4210
Data columns (total 5 columns):
name 3655 non-null object
district 4165 non-null float64
fbicodedesc 4211 non-null object
frequency 4211 non-null int64
percent 4211 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 164.6+ KB
None


name district fbicodedesc frequency percent
0 Near North Side 18.0 Larceny 94479 1.332076
1 The Loop 1.0 Larceny 82013 1.156316
2 Austin 15.0 Drug Abuse 65297 0.920634
3 Austin 15.0 Simple Battery 52223 0.736301
4 Lake View 19.0 Larceny 48218 0.679834


name district fbicodedesc frequency percent
4206 NaN 31.0 Simple Assault 1 0.000014
4207 NaN 31.0 Drug Abuse 1 0.000014
4208 NaN 20.0 Embezzlement 1 0.000014
4209 NaN 10.0 Embezzlement 1 0.000014
4210 NaN 5.0 Embezzlement 1 0.000014
var = ['name','district','fbicodedesc']
freqs = freqsdf(chicagocrime.query('fbicode in @hcde'),var)

print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)

blanks(2)
10084



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 5 columns):
name 119 non-null object
district 140 non-null float64
fbicodedesc 140 non-null object
frequency 140 non-null int64
percent 140 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.6+ KB
None


name district fbicodedesc frequency percent
0 Austin 15.0 Homicide 1st & 2nd Degree 600 5.950020
1 West Englewood 7.0 Homicide 1st & 2nd Degree 419 4.155097
2 Englewood 7.0 Homicide 1st & 2nd Degree 413 4.095597
3 North Lawndale 10.0 Homicide 1st & 2nd Degree 364 3.609679
4 New City 9.0 Homicide 1st & 2nd Degree 329 3.262594


name district fbicodedesc frequency percent
135 Fuller Park 2.0 Homicide 1st & 2nd Degree 1 0.009917
136 Forest Glen 17.0 Homicide 1st & 2nd Degree 1 0.009917
137 Englewood 3.0 Homicide 1st & 2nd Degree 1 0.009917
138 Chatham 4.0 Homicide 1st & 2nd Degree 1 0.009917
139 Armour Square 2.0 Homicide 1st & 2nd Degree 1 0.009917
var = ['name','district','fbicodedesc']
freqs = freqsdf(chicagocrime.query('hbool'),var)

print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)

blanks(2)
10084



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 140 entries, 0 to 139
Data columns (total 5 columns):
name 119 non-null object
district 140 non-null float64
fbicodedesc 140 non-null object
frequency 140 non-null int64
percent 140 non-null float64
dtypes: float64(2), int64(1), object(2)
memory usage: 5.6+ KB
None


name district fbicodedesc frequency percent
0 Austin 15.0 Homicide 1st & 2nd Degree 600 5.950020
1 West Englewood 7.0 Homicide 1st & 2nd Degree 419 4.155097
2 Englewood 7.0 Homicide 1st & 2nd Degree 413 4.095597
3 North Lawndale 10.0 Homicide 1st & 2nd Degree 364 3.609679
4 New City 9.0 Homicide 1st & 2nd Degree 329 3.262594


name district fbicodedesc frequency percent
135 Fuller Park 2.0 Homicide 1st & 2nd Degree 1 0.009917
136 Forest Glen 17.0 Homicide 1st & 2nd Degree 1 0.009917
137 Englewood 3.0 Homicide 1st & 2nd Degree 1 0.009917
138 Chatham 4.0 Homicide 1st & 2nd Degree 1 0.009917
139 Armour Square 2.0 Homicide 1st & 2nd Degree 1 0.009917
var = ['name','district','fbicodedesc','year']
freqs = freqsdf(chicagocrime.query('vbool and 2015<year<2020'),var)

print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)

blanks(2)
106179



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2264 entries, 0 to 2263
Data columns (total 6 columns):
name 2263 non-null object
district 2264 non-null float64
fbicodedesc 2264 non-null object
year 2264 non-null int64
frequency 2264 non-null int64
percent 2264 non-null float64
dtypes: float64(2), int64(2), object(2)
memory usage: 106.2+ KB
None


name district fbicodedesc year frequency percent
0 Austin 15.0 Robbery 2016 596 0.561316
1 Austin 15.0 Robbery 2017 589 0.554724
2 Austin 15.0 Aggravated Battery 2016 528 0.497273
3 Austin 15.0 Robbery 2018 522 0.491623
4 Austin 15.0 Aggravated Battery 2019 467 0.439823


name district fbicodedesc year frequency \
2259 Armour Square 1.0 Aggravated Assault 2017 1
2260 Archer Heights 8.0 Homicide 1st & 2nd Degree 2018 1
2261 Albany Park 17.0 Homicide 1st & 2nd Degree 2019 1
2262 Albany Park 17.0 Homicide 1st & 2nd Degree 2017 1
2263 NaN 12.0 Criminal Sexual Assault 2018 1

percent
2259 0.000942
2260 0.000942
2261 0.000942
2262 0.000942
2263 0.000942
def mkseason(n):
    """Map a month number (1-12) to its season name; 'unknown' otherwise."""
    if 3 <= n <= 5:
        return 'spring'
    if 6 <= n <= 8:
        return 'summer'
    if 9 <= n <= 11:
        return 'fall'
    if n <= 2 or n == 12:
        return 'winter'
    return 'unknown'

chicagocrime['season'] = chicagocrime['month'].apply(mkseason)


var = ['name','district','fbicodedesc','season']
freqs = freqsdf(chicagocrime.query('vbool'),var)

print(freqs.frequency.sum(),"\n")
metadf(freqs,data=True)

chicagocrime.drop(['season'], axis=1)

blanks(2)
614873



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3012 entries, 0 to 3011
Data columns (total 6 columns):
name 2581 non-null object
district 3004 non-null float64
fbicodedesc 3012 non-null object
season 3012 non-null object
frequency 3012 non-null int64
percent 3012 non-null float64
dtypes: float64(2), int64(1), object(3)
memory usage: 141.3+ KB
None


name district fbicodedesc season frequency percent
0 Austin 15.0 Robbery fall 3122 0.507747
1 Austin 15.0 Robbery summer 3092 0.502868
2 Austin 15.0 Robbery winter 2898 0.471317
3 West Englewood 7.0 Aggravated Battery summer 2659 0.432447
4 Austin 15.0 Aggravated Battery summer 2598 0.422526


name district fbicodedesc season frequency percent
3007 NaN 18.0 Homicide 1st & 2nd Degree fall 1 0.000163
3008 NaN 17.0 Homicide 1st & 2nd Degree winter 1 0.000163
3009 NaN 17.0 Homicide 1st & 2nd Degree fall 1 0.000163
3010 NaN 1.0 Homicide 1st & 2nd Degree spring 1 0.000163
3011 NaN 1.0 Homicide 1st & 2nd Degree fall 1 0.000163

--

--

Get the Medium app

A button that says 'Download on the App Store', and if clicked it will lead you to the iOS App store
A button that says 'Get it on, Google Play', and if clicked it will lead you to the Google Play store
ODSC - Open Data Science

ODSC - Open Data Science

94K Followers

Our passion is bringing thousands of the best and brightest data scientists together under one roof for an incredible learning and networking experience.