Preprocessing

OSRM requires OSM data to be preprocessed before routing. Two algorithms are supported:

  • CH (Contraction Hierarchies) — fastest query times; pipeline: extract → contract
  • MLD (Multi-Level Dijkstra) — better for large networks and dynamic weights; pipeline: extract → partition → customize
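As a quick orientation, the two pipelines can be written down as ordered step lists (a trivial sketch; `PIPELINES` and `preprocessing_steps` are illustrative names, not part of the package — the actual functions are documented below):

```python
# The two OSRM preprocessing pipelines as ordered step lists.
PIPELINES = {
    "CH": ["extract", "contract"],
    "MLD": ["extract", "partition", "customize"],
}

def preprocessing_steps(algorithm: str) -> list:
    """Return the ordered preprocessing steps for a routing algorithm."""
    if algorithm not in PIPELINES:
        raise ValueError(f"unknown algorithm: {algorithm!r}")
    return PIPELINES[algorithm]
```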

extract

extract(input_path, profile='car', output_path=None, threads=None, verbosity='INFO', progress_callback=None, capture_output=False, **kwargs)

Extract OSM data into OSRM format.

This is the first step in the OSRM preprocessing pipeline. It parses OSM data (from .osm, .osm.bz2, or .osm.pbf files) and applies a Lua routing profile to generate graph data for routing.

Parameters:

Name Type Description Default
input_path str

Path to input OSM file (.osm, .osm.bz2, or .osm.pbf)

required
profile str

Profile name ('car', 'bicycle', 'foot') or path to custom .lua file

'car'
output_path Optional[str]

Base path for output files. If None, uses input_path base name.

None
threads Optional[int]

Number of threads to use. If None, uses all available CPU cores.

None
verbosity str

Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"

'INFO'
progress_callback Optional[Callable[[str], None]]

Optional callback function(line: str) for progress updates. Called with each log line. Keep lightweight to avoid slowing down the extraction process.

None
capture_output bool

If True, capture stdout/stderr and return in result dict. If False and no progress_callback, output goes directly to console.

False
**kwargs

Additional ExtractorConfig parameters (small_component_size, use_metadata, parse_conditionals, etc.)

{}

Returns:

Type Description
Dict[str, Any]

Dict with keys:

  • success: bool - Whether extraction succeeded
  • duration: float - Time taken in seconds
  • stdout: str - Captured stdout (if capture_output=True or progress_callback set)
  • stderr: str - Captured stderr (if capture_output=True or progress_callback set)
  • error: str - Error message (if success=False)

Example

import osrm

Simple extraction with bundled profile

result = osrm.extract('data.osm.pbf', profile='car')

With different profile

result = osrm.extract('data.osm.pbf', profile='bicycle')

With custom profile

result = osrm.extract('data.osm.pbf', profile='path/to/custom.lua')

With progress callback

def show_progress(line):
    print(f"Progress: {line}")

result = osrm.extract('data.osm.pbf', progress_callback=show_progress)
print(f"Completed in {result['duration']:.2f}s")

With custom settings

result = osrm.extract(
    'data.osm.pbf',
    profile='bicycle',
    threads=4,
    small_component_size=500,
    use_metadata=True
)
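The returned dict can be consumed defensively, using only the documented keys. A minimal sketch (`raise_on_failure` is a hypothetical helper, shown here with simulated result dicts rather than a real extraction run):

```python
def raise_on_failure(result):
    """Raise if a preprocessing result dict reports failure.

    Relies only on the documented result keys: success, error, duration.
    """
    if not result.get("success", False):
        raise RuntimeError(result.get("error", "preprocessing failed"))
    return result

# Simulated result dicts illustrating the documented shape:
ok = raise_on_failure({"success": True, "duration": 12.3})
print(f"Completed in {ok['duration']:.2f}s")
```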

Source code in osrm/preprocessing.py
def extract(
    input_path: str,
    profile: str = "car",
    output_path: Optional[str] = None,
    threads: Optional[int] = None,
    verbosity: str = "INFO",
    progress_callback: Optional[Callable[[str], None]] = None,
    capture_output: bool = False,
    **kwargs
) -> Dict[str, Any]:
    """
    Extract OSM data into OSRM format.

    This is the first step in the OSRM preprocessing pipeline. It parses OSM data
    (from .osm, .osm.bz2, or .osm.pbf files) and applies a Lua routing profile to
    generate graph data for routing.

    Args:
        input_path: Path to input OSM file (.osm, .osm.bz2, or .osm.pbf)
        profile: Profile name ('car', 'bicycle', 'foot') or path to custom .lua file
        output_path: Base path for output files. If None, uses input_path base name.
        threads: Number of threads to use. If None, uses all available CPU cores.
        verbosity: Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"
        progress_callback: Optional callback function(line: str) for progress updates.
                          Called with each log line. Keep lightweight to avoid slowing
                          down the extraction process.
        capture_output: If True, capture stdout/stderr and return in result dict.
                       If False and no progress_callback, output goes directly to console.
        **kwargs: Additional ExtractorConfig parameters (small_component_size,
                 use_metadata, parse_conditionals, etc.)

    Returns:
        Dict with keys:
            - success: bool - Whether extraction succeeded
            - duration: float - Time taken in seconds
            - stdout: str - Captured stdout (if capture_output=True or progress_callback set)
            - stderr: str - Captured stderr (if capture_output=True or progress_callback set)
            - error: str - Error message (if success=False)

    Example:
        >>> import osrm
        >>> 
        >>> # Simple extraction with bundled profile
        >>> result = osrm.extract('data.osm.pbf', profile='car')
        >>> 
        >>> # With different profile
        >>> result = osrm.extract('data.osm.pbf', profile='bicycle')
        >>> 
        >>> # With custom profile
        >>> result = osrm.extract('data.osm.pbf', profile='path/to/custom.lua')
        >>> 
        >>> # With progress callback
        >>> def show_progress(line):
        ...     print(f"Progress: {line}")
        >>> result = osrm.extract('data.osm.pbf', progress_callback=show_progress)
        >>> print(f"Completed in {result['duration']:.2f}s")
        >>> 
        >>> # With custom settings
        >>> result = osrm.extract(
        ...     'data.osm.pbf',
        ...     profile='bicycle',
        ...     threads=4,
        ...     small_component_size=500,
        ...     use_metadata=True
        ... )
    """
    # Create config
    config = osrm_ext.ExtractorConfig()
    config.input_path = Path(input_path)

    # Resolve profile path - convert to absolute string for C++ binding
    profile_path = _resolve_profile(profile)
    config.profile_path = str(profile_path.absolute())

    # Set output path
    if output_path:
        config.UseDefaultOutputNames(Path(output_path))
    else:
        config.UseDefaultOutputNames(Path(input_path))

    # Set threads
    if threads is not None:
        config.requested_num_threads = threads
    else:
        # Default to CPU count to avoid TBB issues with 0 threads
        import os
        config.requested_num_threads = os.cpu_count() or 1

    # Apply additional kwargs
    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)

    # Execute
    if capture_output or progress_callback is not None:
        return osrm_ext.extract_with_capture(config, verbosity, progress_callback)
    else:
        osrm_ext.extract(config, verbosity)
        return {"success": True}
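Note the kwargs loop at the end of the source above: only names that already exist on the config object are applied, so a misspelled option is silently dropped rather than raising an error. A self-contained sketch of that behavior (`DemoConfig` and `apply_kwargs` are stand-ins for illustration, not part of the package):

```python
class DemoConfig:
    """Stand-in for an osrm_ext config object (illustration only)."""
    small_component_size = 1000

def apply_kwargs(config, **kwargs):
    """Mirror of the kwargs loop used by extract/contract/partition/customize:
    attributes that exist on the config are set; unknown names are ignored."""
    applied = {}
    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)
            applied[key] = value
    return applied

cfg = DemoConfig()
# Note the deliberate typo in the second keyword: it is silently ignored.
applied = apply_kwargs(cfg, small_component_size=500, smal_component_size=250)
```

Double-checking spelling of `**kwargs` options is therefore worthwhile, since no error is raised for an unrecognized name.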

contract

contract(input_path, threads=None, verbosity='INFO', progress_callback=None, capture_output=False, **kwargs)

Run contraction hierarchy computation for CH algorithm.

This is used for the CH (Contraction Hierarchies) routing algorithm. Run this after extract() if you plan to use CH routing.

For MLD algorithm, use partition() and customize() instead.

Parameters:

Name Type Description Default
input_path str

Base path to .osrm files (output from extract)

required
threads Optional[int]

Number of threads to use. If None, uses all available CPU cores.

None
verbosity str

Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"

'INFO'
progress_callback Optional[Callable[[str], None]]

Optional callback function(line: str) for progress updates

None
capture_output bool

If True, capture and return stdout/stderr

False
**kwargs

Additional ContractorConfig parameters

{}

Returns:

Type Description
Dict[str, Any]

Dict with success, duration, stdout, stderr, and optionally error keys

Example

import osrm

First extract

osrm.extract('data.osm.pbf', profile='profiles/car.lua')

Then contract

result = osrm.contract('data.osrm')
print(f"Contraction completed in {result['duration']:.2f}s")

Source code in osrm/preprocessing.py
def contract(
    input_path: str,
    threads: Optional[int] = None,
    verbosity: str = "INFO",
    progress_callback: Optional[Callable[[str], None]] = None,
    capture_output: bool = False,
    **kwargs
) -> Dict[str, Any]:
    """
    Run contraction hierarchy computation for CH algorithm.

    This is used for the CH (Contraction Hierarchies) routing algorithm.
    Run this after extract() if you plan to use CH routing.

    For MLD algorithm, use partition() and customize() instead.

    Args:
        input_path: Base path to .osrm files (output from extract)
        threads: Number of threads to use. If None, uses all available CPU cores.
        verbosity: Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"
        progress_callback: Optional callback function(line: str) for progress updates
        capture_output: If True, capture and return stdout/stderr
        **kwargs: Additional ContractorConfig parameters

    Returns:
        Dict with success, duration, stdout, stderr, and optionally error keys

    Example:
        >>> import osrm
        >>> 
        >>> # First extract
        >>> osrm.extract('data.osm.pbf', profile='profiles/car.lua')
        >>> 
        >>> # Then contract
        >>> result = osrm.contract('data.osrm')
        >>> print(f"Contraction completed in {result['duration']:.2f}s")
    """
    config = osrm_ext.ContractorConfig()
    config.UseDefaultOutputNames(Path(input_path))

    if threads is not None:
        config.requested_num_threads = threads
    else:
        # Default to CPU count to avoid TBB issues with 0 threads
        import os
        config.requested_num_threads = os.cpu_count() or 1

    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)

    if capture_output or progress_callback is not None:
        return osrm_ext.contract_with_capture(config, verbosity, progress_callback)
    else:
        osrm_ext.contract(config, verbosity)
        return {"success": True}

partition

partition(input_path, threads=None, verbosity='INFO', progress_callback=None, capture_output=False, **kwargs)

Partition graph for MLD (Multi-Level Dijkstra) algorithm.

This is the second step for MLD routing (after extract). Follow with customize() to complete the MLD preprocessing pipeline.

Parameters:

Name Type Description Default
input_path str

Base path to .osrm files (output from extract)

required
threads Optional[int]

Number of threads to use. If None, uses all available CPU cores.

None
verbosity str

Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"

'INFO'
progress_callback Optional[Callable[[str], None]]

Optional callback function(line: str) for progress updates

None
capture_output bool

If True, capture and return stdout/stderr

False
**kwargs

Additional PartitionerConfig parameters (balance, boundary_factor, num_optimizing_cuts, small_component_size, max_cell_sizes)

{}

Returns:

Type Description
Dict[str, Any]

Dict with success, duration, stdout, stderr, and optionally error keys

Example

import osrm

First extract

osrm.extract('data.osm.pbf', profile='profiles/car.lua')

Then partition for MLD

result = osrm.partition('data.osrm')

Finally customize

osrm.customize('data.osrm')

Source code in osrm/preprocessing.py
def partition(
    input_path: str,
    threads: Optional[int] = None,
    verbosity: str = "INFO",
    progress_callback: Optional[Callable[[str], None]] = None,
    capture_output: bool = False,
    **kwargs
) -> Dict[str, Any]:
    """
    Partition graph for MLD (Multi-Level Dijkstra) algorithm.

    This is the second step for MLD routing (after extract).
    Follow with customize() to complete the MLD preprocessing pipeline.

    Args:
        input_path: Base path to .osrm files (output from extract)
        threads: Number of threads to use. If None, uses all available CPU cores.
        verbosity: Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"
        progress_callback: Optional callback function(line: str) for progress updates
        capture_output: If True, capture and return stdout/stderr
        **kwargs: Additional PartitionerConfig parameters (balance, boundary_factor,
                 num_optimizing_cuts, small_component_size, max_cell_sizes)

    Returns:
        Dict with success, duration, stdout, stderr, and optionally error keys

    Example:
        >>> import osrm
        >>> 
        >>> # First extract
        >>> osrm.extract('data.osm.pbf', profile='profiles/car.lua')
        >>> 
        >>> # Then partition for MLD
        >>> result = osrm.partition('data.osrm')
        >>> 
        >>> # Finally customize
        >>> osrm.customize('data.osrm')
    """
    config = osrm_ext.PartitionerConfig()
    config.UseDefaultOutputNames(Path(input_path))

    if threads is not None:
        config.requested_num_threads = threads
    else:
        # Default to CPU count to avoid TBB issues with 0 threads
        import os
        config.requested_num_threads = os.cpu_count() or 1

    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)

    if capture_output or progress_callback is not None:
        return osrm_ext.partition_with_capture(config, verbosity, progress_callback)
    else:
        osrm_ext.partition(config, verbosity)
        return {"success": True}

customize

customize(input_path, threads=None, verbosity='INFO', progress_callback=None, capture_output=False, **kwargs)

Customize partitioned graph for MLD (Multi-Level Dijkstra) algorithm.

This is the final step for MLD routing (after extract and partition).

Parameters:

Name Type Description Default
input_path str

Base path to .osrm files (output from partition)

required
threads Optional[int]

Number of threads to use. If None, uses all available CPU cores.

None
verbosity str

Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"

'INFO'
progress_callback Optional[Callable[[str], None]]

Optional callback function(line: str) for progress updates

None
capture_output bool

If True, capture and return stdout/stderr

False
**kwargs

Additional CustomizationConfig parameters

{}

Returns:

Type Description
Dict[str, Any]

Dict with success, duration, stdout, stderr, and optionally error keys

Example

import osrm

Complete MLD pipeline

osrm.extract('data.osm.pbf', profile='profiles/car.lua')
osrm.partition('data.osrm')
result = osrm.customize('data.osrm')

Now ready for routing with MLD

engine = osrm.OSRM('data.osrm', algorithm='MLD')
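The pipeline above can be wrapped in a small convenience function that stops at the first failed step. This is a hypothetical helper, not part of the package: `osrm_base_path` assumes the default output naming (extracting `data.osm.pbf` yields a `data.osrm` base), and the `osrm` import is deferred so the sketch can be defined without data on hand.

```python
def osrm_base_path(osm_path: str) -> str:
    """Derive the default .osrm base name (assumption: mirrors the
    default output naming used when output_path is None)."""
    for suffix in (".osm.pbf", ".osm.bz2", ".osm"):
        if osm_path.endswith(suffix):
            return osm_path[: -len(suffix)] + ".osrm"
    return osm_path + ".osrm"

def preprocess_mld(osm_path, profile="car", threads=None):
    """Run extract -> partition -> customize, stopping at the first failure.

    Returns the failing step's result dict on error, or a success dict
    carrying the derived base path.
    """
    import osrm  # deferred so the sketch is importable without data

    base = osrm_base_path(osm_path)
    steps = (
        lambda: osrm.extract(osm_path, profile=profile, threads=threads,
                             capture_output=True),
        lambda: osrm.partition(base, threads=threads, capture_output=True),
        lambda: osrm.customize(base, threads=threads, capture_output=True),
    )
    for step in steps:
        result = step()
        if not result.get("success", False):
            return result  # propagate the failing step's error details
    return {"success": True, "base_path": base}
```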

Source code in osrm/preprocessing.py
def customize(
    input_path: str,
    threads: Optional[int] = None,
    verbosity: str = "INFO",
    progress_callback: Optional[Callable[[str], None]] = None,
    capture_output: bool = False,
    **kwargs
) -> Dict[str, Any]:
    """
    Customize partitioned graph for MLD (Multi-Level Dijkstra) algorithm.

    This is the final step for MLD routing (after extract and partition).

    Args:
        input_path: Base path to .osrm files (output from partition)
        threads: Number of threads to use. If None, uses all available CPU cores.
        verbosity: Log level - one of: "NONE", "ERROR", "WARNING", "INFO", "DEBUG"
        progress_callback: Optional callback function(line: str) for progress updates
        capture_output: If True, capture and return stdout/stderr
        **kwargs: Additional CustomizationConfig parameters

    Returns:
        Dict with success, duration, stdout, stderr, and optionally error keys

    Example:
        >>> import osrm
        >>> 
        >>> # Complete MLD pipeline
        >>> osrm.extract('data.osm.pbf', profile='profiles/car.lua')
        >>> osrm.partition('data.osrm')
        >>> result = osrm.customize('data.osrm')
        >>> 
        >>> # Now ready for routing with MLD
        >>> engine = osrm.OSRM('data.osrm', algorithm='MLD')
    """
    config = osrm_ext.CustomizationConfig()
    config.UseDefaultOutputNames(Path(input_path))

    if threads is not None:
        config.requested_num_threads = threads
    else:
        # Default to CPU count to avoid TBB issues with 0 threads
        import os
        config.requested_num_threads = os.cpu_count() or 1

    for key, value in kwargs.items():
        if hasattr(config, key):
            setattr(config, key, value)

    if capture_output or progress_callback is not None:
        return osrm_ext.customize_with_capture(config, verbosity, progress_callback)
    else:
        osrm_ext.customize(config, verbosity)
        return {"success": True}