In [1]:
import os, glob, getpass
import pandas as pd
import fiona
import geopandas as gpd
username = getpass.getuser()

## Set up input/output folders

In [2]:
# The input, match-result and output folders all sit under the same Box
# "Data" directory. Building that root once keeps the folder names (notably
# the capitalisation of 'DataViz Projects') identical in all three paths, so
# they also resolve on a case-sensitive filesystem.
data_root_dir = os.path.join('/Users',
                             username,
                             'Box',
                             'DataViz Projects',
                             'Spatial Analysis and Mapping',
                             'Active Transportation Plan',
                             'Data')

# Raw network .geojson files; passed as `raw_dir` to the batch function below.
geojson_dir = os.path.join(data_root_dir, 'geojson')

# Match result .geojson files (the *_matched.geojson files listed below).
match_dir = os.path.join(data_root_dir, 'shst_match_results', 'matched')

# Export root; the batch function writes into its 'matched' and 'unmatched'
# sub-folders.
output_dir = os.path.join(data_root_dir, 'final_nw_datasets')

In [3]:
# os.listdir returns entries in arbitrary order; sort them so the displayed
# listing is the same on every run.
sorted(os.listdir(match_dir))

['actc_bike_network_epsg4326_exst_matched.geojson',
 'actc_bike_network_epsg4326_ppsd_matched.geojson',
 'batc_bike_network_v2_epsg4326_exst_matched.geojson',
 'batc_bike_network_v2_epsg4326_ppsd_matched.geojson',
 'caltrans_d4_bike_network_epsg4326_ppsd_matched.geojson',
 'ccag_bike_network_epsg4326_exst_matched.geojson',
 'ccag_bike_network_epsg4326_ppsd_matched.geojson',
 'ccta_bike_network_epsg4326_exst_matched.geojson',
 'ccta_bike_network_epsg4326_ppsd_matched.geojson',
 'nvta_bike_network_epsg4326_exst_matched.geojson',
 'nvta_bike_network_epsg4326_ppsd_matched.geojson',
 'oakland_bike_network_epsg4326_exst_matched.geojson',
 'oakland_bike_network_epsg4326_ppsd_matched.geojson',
 'san_jose_bike_nw_epsg4326_exst_matched.geojson',
 'san_jose_bike_nw_epsg4326_ppsd_matched.geojson',
 'scta_bike_network_epsg4326_exst_matched.geojson',
 'scta_bike_network_epsg4326_ppsd_matched.geojson',
 'sfcta_bike_network_epsg4326_exst_matched.geojson',
 'sta_bike_network_epsg4326_exst_matched.geojs

## Functions

In [30]:
def merge_deduplicate_matched_nw(match_dir,in_file):
    """
    Merge and deduplicate matched existing and proposed bike network datasets.

    Every file in ``match_dir`` whose name is ``in_file`` followed by at least
    one more character and the ``.geojson`` extension is read and stacked into
    a single table. The generic ``pp_*`` / ``source`` columns are renamed with
    a dataset-specific prefix, the geometry is reprojected to EPSG:3857 and a
    link length is computed. For each (shstReferenceId, shstGeometryId) pair
    only the longest record is kept.

    Parameters
    ----------
    match_dir : str
        Folder holding the match result ``.geojson`` files.
    in_file : str
        Dataset file stem; must be one of the keys of ``col_name_dict`` below.

    Returns
    -------
    The deduplicated records restricted to the id columns, the three prefixed
    attribute columns and ``mtc_facility_id`` (the geometry and the temporary
    ``length`` column are not part of the result).

    Raises
    ------
    KeyError
        If ``in_file`` is not a key of ``col_name_dict``.
    FileNotFoundError
        If no match result file is found for ``in_file`` in ``match_dir``.
    """

    col_name_dict = {
        'san_jose_bike_nw_epsg4326': 'sj',
        'vta_bike_network_v2_epsg4326': 'cma',
        'batc_bike_network_v2_epsg4326': 'batc',
        'caltrans_d4_bike_network_epsg4326': 'caltrans',
        'oakland_bike_network_epsg4326': 'oak',
        'tam_bike_network_epsg4326': 'cma',
        'sta_bike_network_epsg4326': 'cma',
        'sfcta_bike_network_epsg4326': 'cma',
        'scta_bike_network_epsg4326': 'cma',
        'nvta_bike_network_epsg4326': 'cma',
        'ccta_bike_network_epsg4326': 'cma',
        'ccag_bike_network_epsg4326': 'cma',
        'actc_bike_network_epsg4326': 'cma'
    }

    # Resolve the prefix before any file is read so that an unknown dataset
    # name fails immediately instead of after the (slow) read step.
    col_prefix = col_name_dict[in_file]

    # Escape glob metacharacters in the directory and stem so only the
    # trailing '?*' acts as a wildcard. Sorting makes the read order (and so
    # the outcome of the deduplication on ties) the same on every run.
    pattern = os.path.join(glob.escape(match_dir),
                           glob.escape(in_file) + '?*.geojson')
    match_files = sorted(glob.glob(pattern))

    if not match_files:
        raise FileNotFoundError(
            'No match result files found for pattern: ' + pattern
        )

    print('----------Start reading shst match results data-------------')

    # Collect the frames and concatenate once rather than growing a frame
    # inside the loop (which re-copies all previous rows on every iteration).
    gdf_list = []
    for file in match_files:
        print('Reading shst match results data: ' + file)
        gdf_list.append(gpd.read_file(file))

    concat_gdf = pd.concat(gdf_list,
                           ignore_index=True,
                           sort=False)

    print('----------Finished reading shst match results data----------')

    print('\n-----------Renaming columns-------------------------------')

    rename_dict = {
        'pp_ex_class':col_prefix + '_ex_class',
        'pp_pln_class':col_prefix + '_pln_class',
        'source':col_prefix + '_source',
        'pp_mtc_facility_id':'mtc_facility_id'
    }

    concat_gdf = concat_gdf.rename(columns=rename_dict)

    print('\n-----------Reprojecting-----------------------------------')

    concat_gdf = concat_gdf.to_crs('EPSG:3857')

    print('\n-----------Adding link length column----------------------')

    # Length is measured in the units of the projected CRS set just above; it
    # is only used to rank duplicate records against each other.
    concat_gdf['length'] = concat_gdf['geometry'].length

    print('\n--Sorting by length decending, drop duplicates by subset--')
    print('\nCount of records: ',concat_gdf.shape[0])

    subset = [
        'shstReferenceId',
        'shstGeometryId'
    ]
    # Longest record first, so keep='first' retains the longest duplicate.
    dedup_gdf = (
        concat_gdf
        .sort_values('length',ascending=False)
        .drop_duplicates(subset=subset,keep='first')
    )

    print('\nFinal count of deduped records: ',dedup_gdf.shape[0])

    col_subset = [
        'shstReferenceId',
        'shstGeometryId',
        'fromIntersectionId',
        'toIntersectionId',
        col_prefix + '_ex_class',
        col_prefix + '_pln_class',
        col_prefix + '_source',
        'mtc_facility_id'
    ]
    return dedup_gdf[col_subset]

In [31]:
def compare_matched_raw_nw(matched_df,raw_dir,in_file):
    """
    Compare matched bike network to original raw input and return unmatched as gdf.

    The raw file ``<raw_dir>/<in_file>.geojson`` is read and left-joined to the
    matched records on ``mtc_facility_id``. Raw records whose id has no
    counterpart in ``matched_df`` are returned with the raw file's columns.

    Parameters
    ----------
    matched_df : DataFrame
        Matched records; only its ``mtc_facility_id`` column is used.
    raw_dir : str
        Folder holding the raw ``.geojson`` files.
    in_file : str
        Raw file name without the ``.geojson`` extension.

    Returns
    -------
    The unmatched raw records, restricted to the raw file's original columns.
    """
    print('-----------Reading raw files------------------------------')

    raw_file = os.path.join(raw_dir,in_file +'.geojson')

    raw_gdf = gpd.read_file(raw_file)

    raw_cols = raw_gdf.columns.to_list()

    print('\n-----------Merging matched and raw files------------------')

    # Join against the distinct ids only. Joining the full matched table would
    # (a) suffix any non-key column name shared with the raw file, making the
    # raw column selection below fail, and (b) repeat each raw row once per
    # matched segment for no benefit.
    matched_ids = matched_df[['mtc_facility_id']].drop_duplicates()

    raw_match_merge = pd.merge(raw_gdf,
                               matched_ids,
                               how='left',
                               on='mtc_facility_id',
                               indicator=True)

    print('\n-----------Creating new unmatched dataframe---------------')

    # 'left_only' marks raw rows whose id was not found among the matched ids.
    is_unmatched = raw_match_merge['_merge'] == 'left_only'
    unmatched_gdf = raw_match_merge.loc[is_unmatched, raw_cols].copy()

    print('\nUnmatched records: ',unmatched_gdf.shape[0])

    return unmatched_gdf

In [32]:
def batch_post_process_nw_datasets(file_list,match_dir,raw_dir,output_dir):
    """
    Post-process every dataset in ``file_list`` and export the results.

    For each file stem the matched records are merged and deduplicated, then
    compared with the raw input. Two files are written per dataset:

    * ``<output_dir>/matched/<stem without '_epsg4326'>_matched.json``
    * ``<output_dir>/unmatched/<stem>_unmatched.geojson``

    Parameters
    ----------
    file_list : list of str
        Dataset file stems to process.
    match_dir : str
        Folder holding the match result ``.geojson`` files.
    raw_dir : str
        Folder holding the raw ``.geojson`` files.
    output_dir : str
        Export root; the ``matched`` and ``unmatched`` sub-folders are created
        if they do not exist yet.
    """
    matched_out_dir = os.path.join(output_dir,'matched')
    unmatched_out_dir = os.path.join(output_dir,'unmatched')

    # Neither writer below creates missing folders, so make sure both exist
    # before the (slow) processing starts.
    os.makedirs(matched_out_dir,exist_ok=True)
    os.makedirs(unmatched_out_dir,exist_ok=True)

    for file in file_list:

        print('Merging and dedupliating existing and proposed matched network...')

        dedup_df = merge_deduplicate_matched_nw(match_dir=match_dir,in_file=file)

        print('\nMerging complete.')
        print('\nCompare matched network to original raw network...')

        unmatched_gdf = compare_matched_raw_nw(matched_df=dedup_df,raw_dir=raw_dir,in_file=file)

        print('\nComparison complete.')

        print('\nExporting merged matched json and unmatched geojson...')

        # The matched export drops the '_epsg4326' tag from the stem; the
        # unmatched export keeps the stem as is.
        out_file_nm = file.replace('_epsg4326','')
        out_file_path = os.path.join(matched_out_dir,out_file_nm +'_matched.json')
        dedup_df.to_json(out_file_path)

        out_unmatched_path = os.path.join(unmatched_out_dir,file + '_unmatched.geojson')
        unmatched_gdf.to_file(out_unmatched_path,driver='GeoJSON')

        print('\nExported matched to: ',out_file_path)
        print('\nExported unmatched to: ',out_unmatched_path)

## Batch process existing and proposed matched network datasets

1. Merge existing and proposed datasets and create output .json file
2. Compare matched with raw input data to create an unmatched .geojson output file

In [33]:
# File stems of the datasets to post-process. Every stem ends with the same
# '_epsg4326' tag, so only the distinguishing part is spelled out here.
data_list = [
    stem + '_epsg4326'
    for stem in (
        'san_jose_bike_nw',
        'vta_bike_network_v2',
        'batc_bike_network_v2',
        'caltrans_d4_bike_network',
        'oakland_bike_network',
        'tam_bike_network',
        'sta_bike_network',
        'sfcta_bike_network',
        'scta_bike_network',
        'nvta_bike_network',
        'ccta_bike_network',
        'ccag_bike_network',
        'actc_bike_network',
    )
]

In [34]:
# Process every dataset in data_list: read match results from match_dir,
# compare against the raw files in geojson_dir, export under output_dir.
batch_post_process_nw_datasets(data_list, match_dir, geojson_dir, output_dir)

Merging and dedupliating existing and proposed matched network...
----------Start reading shst match results data-------------
Reading shst match results data: /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/shst_match_results/matched/san_jose_bike_nw_epsg4326_exst_matched.geojson
Reading shst match results data: /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/shst_match_results/matched/san_jose_bike_nw_epsg4326_ppsd_matched.geojson
----------Finished reading shst match results data----------

-----------Renaming columns-------------------------------

-----------Reprojecting-----------------------------------

-----------Adding link length column----------------------

--Sorting by length decending, drop duplicates by subset--

Count of records:  26770

Final count of deduped records:  25459

Merging complete.

Compare matched network to original raw network...
-----------Reading raw files-

Reading shst match results data: /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/shst_match_results/matched/tam_bike_network_epsg4326_ppsd_matched.geojson
----------Finished reading shst match results data----------

-----------Renaming columns-------------------------------

-----------Reprojecting-----------------------------------

-----------Adding link length column----------------------

--Sorting by length decending, drop duplicates by subset--

Count of records:  9996

Final count of deduped records:  8364

Merging complete.

Compare matched network to original raw network...
-----------Reading raw files------------------------------

-----------Merging matched and raw files------------------

-----------Creating new unmatched dataframe---------------

Unmatched records:  195

Comparison complete.

Exporting merged matched json and unmatched geojson...

Exported matched to:  /Users/jcroff/Box/DataViz Projects/Spatial Analysis and 


-----------Adding link length column----------------------

--Sorting by length decending, drop duplicates by subset--

Count of records:  27650

Final count of deduped records:  24969

Merging complete.

Compare matched network to original raw network...
-----------Reading raw files------------------------------

-----------Merging matched and raw files------------------

-----------Creating new unmatched dataframe---------------

Unmatched records:  418

Comparison complete.

Exporting merged matched json and unmatched geojson...

Exported matched to:  /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/final_nw_datasets/matched/ccta_bike_network_matched.json

Exported unmatched to:  /Users/jcroff/Box/DataViz Projects/Spatial Analysis and Mapping/Active Transportation Plan/Data/final_nw_datasets/unmatched/ccta_bike_network_epsg4326_unmatched.geojson
Merging and dedupliating existing and proposed matched network...
----------Start reading s